Diffstat (limited to 'drivers/md')
63 files changed, 2241 insertions, 1656 deletions
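The bulk of the bcache changes below replace the driver's private heap macros (DECLARE_HEAP, heap_add, heap_sift, heap_pop) with the generic <linux/min_heap.h> API, which drives the heap through a min_heap_callbacks structure holding a "less" comparator and a "swp" swap helper. A minimal userspace sketch of that callback pattern follows; the sketch_* names are illustrative stand-ins, not the kernel API:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct sketch_callbacks {
	bool (*less)(const void *l, const void *r, void *args);
	void (*swp)(void *l, void *r, void *args);
};

struct sketch_heap {
	size_t nr, size;
	int *data;
};

/* Restore the heap property below index i, as min_heap_sift_down() does. */
static void sketch_sift_down(struct sketch_heap *h, size_t i,
			     const struct sketch_callbacks *cb, void *args)
{
	for (;;) {
		size_t l = 2 * i + 1, r = l + 1, best = i;

		if (l < h->nr && cb->less(&h->data[l], &h->data[best], args))
			best = l;
		if (r < h->nr && cb->less(&h->data[r], &h->data[best], args))
			best = r;
		if (best == i)
			return;
		cb->swp(&h->data[i], &h->data[best], args);
		i = best;
	}
}

static bool int_less(const void *l, const void *r, void *args)
{
	return *(const int *)l < *(const int *)r;
}

static void int_swap(void *l, void *r, void *args)
{
	int tmp = *(int *)l;

	*(int *)l = *(int *)r;
	*(int *)r = tmp;
}

int main(void)
{
	int buf[] = { 9, 4, 7, 1, 5 };
	struct sketch_heap h = { .nr = 5, .size = 5, .data = buf };
	const struct sketch_callbacks cb = { .less = int_less, .swp = int_swap };
	size_t i;

	/* Heapify bottom-up, the equivalent of min_heapify_all(). */
	for (i = h.nr / 2; i-- > 0;)
		sketch_sift_down(&h, i, &cb, NULL);

	while (h.nr) {	/* pop in ascending order, like min_heap_pop() loops */
		printf("%d\n", h.data[0]);
		cb.swp(&h.data[0], &h.data[--h.nr], NULL);
		sketch_sift_down(&h, 0, &cb, NULL);
	}
	return 0;
}

Note how invalidate_buckets_lru() above keeps two comparators over the same heap: a max-heap ordering while filling (so the worst candidate sits on top and can be replaced) and a min-heap ordering while draining, with only the callbacks structure changing.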
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 35b1080752cd..1e9db8e4acdf 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -540,6 +540,16 @@ config DM_VERITY_VERIFY_ROOTHASH_SIG_SECONDARY_KEYRING If unsure, say N. +config DM_VERITY_VERIFY_ROOTHASH_SIG_PLATFORM_KEYRING + bool "Verity data device root hash signature verification with platform keyring" + default DM_VERITY_VERIFY_ROOTHASH_SIG_SECONDARY_KEYRING + depends on DM_VERITY_VERIFY_ROOTHASH_SIG + depends on INTEGRITY_PLATFORM_KEYRING + help + Rely also on the platform keyring to verify dm-verity signatures. + + If unsure, say N. + config DM_VERITY_FEC bool "Verity forward error correction support" depends on DM_VERITY diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 48ce750bf70a..da50f6661bae 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -164,40 +164,68 @@ static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) * prio is worth 1/8th of what INITIAL_PRIO is worth. */ -#define bucket_prio(b) \ -({ \ - unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ - \ - (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ -}) +static inline unsigned int new_bucket_prio(struct cache *ca, struct bucket *b) +{ + unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; + + return (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); +} + +static inline bool new_bucket_max_cmp(const void *l, const void *r, void *args) +{ + struct bucket **lhs = (struct bucket **)l; + struct bucket **rhs = (struct bucket **)r; + struct cache *ca = args; + + return new_bucket_prio(ca, *lhs) > new_bucket_prio(ca, *rhs); +} -#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) -#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) +static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args) +{ + struct bucket **lhs = (struct bucket **)l; + struct bucket **rhs = (struct bucket **)r; + struct cache *ca = args; + + return new_bucket_prio(ca, *lhs) < new_bucket_prio(ca, *rhs); +} + +static inline void new_bucket_swap(void *l, void *r, void __always_unused *args) +{ + struct bucket **lhs = l, **rhs = r; + + swap(*lhs, *rhs); +} static void invalidate_buckets_lru(struct cache *ca) { struct bucket *b; - ssize_t i; + const struct min_heap_callbacks bucket_max_cmp_callback = { + .less = new_bucket_max_cmp, + .swp = new_bucket_swap, + }; + const struct min_heap_callbacks bucket_min_cmp_callback = { + .less = new_bucket_min_cmp, + .swp = new_bucket_swap, + }; - ca->heap.used = 0; + ca->heap.nr = 0; for_each_bucket(b, ca) { if (!bch_can_invalidate_bucket(ca, b)) continue; - if (!heap_full(&ca->heap)) - heap_add(&ca->heap, b, bucket_max_cmp); - else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { + if (!min_heap_full(&ca->heap)) + min_heap_push(&ca->heap, &b, &bucket_max_cmp_callback, ca); + else if (!new_bucket_max_cmp(&b, min_heap_peek(&ca->heap), ca)) { ca->heap.data[0] = b; - heap_sift(&ca->heap, 0, bucket_max_cmp); + min_heap_sift_down(&ca->heap, 0, &bucket_max_cmp_callback, ca); } } - for (i = ca->heap.used / 2 - 1; i >= 0; --i) - heap_sift(&ca->heap, i, bucket_min_cmp); + min_heapify_all(&ca->heap, &bucket_min_cmp_callback, ca); while (!fifo_full(&ca->free_inc)) { - if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { + if (!ca->heap.nr) { /* * We don't want to be calling invalidate_buckets() * multiple times when it can't do anything @@ -206,6 +234,8 @@ static void invalidate_buckets_lru(struct cache *ca) 
wake_up_gc(ca->set); return; } + b = min_heap_peek(&ca->heap)[0]; + min_heap_pop(&ca->heap, &bucket_min_cmp_callback, ca); bch_invalidate_one_bucket(ca, b); } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 1d33e40d26ea..785b0d9008fa 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -458,7 +458,7 @@ struct cache { /* Allocation stuff: */ struct bucket *buckets; - DECLARE_HEAP(struct bucket *, heap); + DEFINE_MIN_HEAP(struct bucket *, cache_heap) heap; /* * If nonzero, we know we aren't going to find any buckets to invalidate diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 463eb13bd0b2..bd97d8626887 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -54,9 +54,11 @@ void bch_dump_bucket(struct btree_keys *b) int __bch_count_data(struct btree_keys *b) { unsigned int ret = 0; - struct btree_iter_stack iter; + struct btree_iter iter; struct bkey *k; + min_heap_init(&iter.heap, NULL, MAX_BSETS); + if (b->ops->is_extents) for_each_key(b, k, &iter) ret += KEY_SIZE(k); @@ -67,9 +69,11 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) { va_list args; struct bkey *k, *p = NULL; - struct btree_iter_stack iter; + struct btree_iter iter; const char *err; + min_heap_init(&iter.heap, NULL, MAX_BSETS); + for_each_key(b, k, &iter) { if (b->ops->is_extents) { err = "Keys out of order"; @@ -110,9 +114,9 @@ bug: static void bch_btree_iter_next_check(struct btree_iter *iter) { - struct bkey *k = iter->data->k, *next = bkey_next(k); + struct bkey *k = iter->heap.data->k, *next = bkey_next(k); - if (next < iter->data->end && + if (next < iter->heap.data->end && bkey_cmp(k, iter->b->ops->is_extents ? &START_KEY(next) : next) > 0) { bch_dump_bucket(iter->b); @@ -879,12 +883,14 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, unsigned int status = BTREE_INSERT_STATUS_NO_INSERT; struct bset *i = bset_tree_last(b)->data; struct bkey *m, *prev = NULL; - struct btree_iter_stack iter; + struct btree_iter iter; struct bkey preceding_key_on_stack = ZERO_KEY; struct bkey *preceding_key_p = &preceding_key_on_stack; BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); + min_heap_init(&iter.heap, NULL, MAX_BSETS); + /* * If k has preceding key, preceding_key_p will be set to address * of k's preceding key; otherwise preceding_key_p will be set @@ -895,9 +901,9 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, else preceding_key(k, &preceding_key_p); - m = bch_btree_iter_stack_init(b, &iter, preceding_key_p); + m = bch_btree_iter_init(b, &iter, preceding_key_p); - if (b->ops->insert_fixup(b, k, &iter.iter, replace_key)) + if (b->ops->insert_fixup(b, k, &iter, replace_key)) return status; status = BTREE_INSERT_STATUS_INSERT; @@ -1077,79 +1083,102 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, /* Btree iterator */ -typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, - struct btree_iter_set); +typedef bool (new_btree_iter_cmp_fn)(const void *, const void *, void *); + +static inline bool new_btree_iter_cmp(const void *l, const void *r, void __always_unused *args) +{ + const struct btree_iter_set *_l = l; + const struct btree_iter_set *_r = r; + + return bkey_cmp(_l->k, _r->k) <= 0; +} -static inline bool btree_iter_cmp(struct btree_iter_set l, - struct btree_iter_set r) +static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args) { - return bkey_cmp(l.k, r.k) > 0; + struct btree_iter_set *_iter1 = iter1; + 
struct btree_iter_set *_iter2 = iter2; + + swap(*_iter1, *_iter2); } static inline bool btree_iter_end(struct btree_iter *iter) { - return !iter->used; + return !iter->heap.nr; } void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end) { + const struct min_heap_callbacks callbacks = { + .less = new_btree_iter_cmp, + .swp = new_btree_iter_swap, + }; + if (k != end) - BUG_ON(!heap_add(iter, - ((struct btree_iter_set) { k, end }), - btree_iter_cmp)); + BUG_ON(!min_heap_push(&iter->heap, + &((struct btree_iter_set) { k, end }), + &callbacks, + NULL)); } -static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b, - struct btree_iter_stack *iter, - struct bkey *search, - struct bset_tree *start) +static struct bkey *__bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search, + struct bset_tree *start) { struct bkey *ret = NULL; - iter->iter.size = ARRAY_SIZE(iter->stack_data); - iter->iter.used = 0; + iter->heap.size = ARRAY_SIZE(iter->heap.preallocated); + iter->heap.nr = 0; #ifdef CONFIG_BCACHE_DEBUG - iter->iter.b = b; + iter->b = b; #endif for (; start <= bset_tree_last(b); start++) { ret = bch_bset_search(b, start, search); - bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data)); + bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); } return ret; } -struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, - struct btree_iter_stack *iter, +struct bkey *bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, struct bkey *search) { - return __bch_btree_iter_stack_init(b, iter, search, b->set); + return __bch_btree_iter_init(b, iter, search, b->set); } static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, - btree_iter_cmp_fn *cmp) + new_btree_iter_cmp_fn *cmp) { struct btree_iter_set b __maybe_unused; struct bkey *ret = NULL; + const struct min_heap_callbacks callbacks = { + .less = cmp, + .swp = new_btree_iter_swap, + }; if (!btree_iter_end(iter)) { bch_btree_iter_next_check(iter); - ret = iter->data->k; - iter->data->k = bkey_next(iter->data->k); + ret = iter->heap.data->k; + iter->heap.data->k = bkey_next(iter->heap.data->k); - if (iter->data->k > iter->data->end) { + if (iter->heap.data->k > iter->heap.data->end) { WARN_ONCE(1, "bset was corrupt!\n"); - iter->data->k = iter->data->end; + iter->heap.data->k = iter->heap.data->end; } - if (iter->data->k == iter->data->end) - heap_pop(iter, b, cmp); + if (iter->heap.data->k == iter->heap.data->end) { + if (iter->heap.nr) { + b = min_heap_peek(&iter->heap)[0]; + min_heap_pop(&iter->heap, &callbacks, NULL); + } + } else - heap_sift(iter, 0, cmp); + min_heap_sift_down(&iter->heap, 0, &callbacks, NULL); } return ret; @@ -1157,7 +1186,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, struct bkey *bch_btree_iter_next(struct btree_iter *iter) { - return __bch_btree_iter_next(iter, btree_iter_cmp); + return __bch_btree_iter_next(iter, new_btree_iter_cmp); } @@ -1195,16 +1224,18 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out, struct btree_iter *iter, bool fixup, bool remove_stale) { - int i; struct bkey *k, *last = NULL; BKEY_PADDED(k) tmp; bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale ? 
bch_ptr_bad : bch_ptr_invalid; + const struct min_heap_callbacks callbacks = { + .less = b->ops->sort_cmp, + .swp = new_btree_iter_swap, + }; /* Heapify the iterator, using our comparison function */ - for (i = iter->used / 2 - 1; i >= 0; --i) - heap_sift(iter, i, b->ops->sort_cmp); + min_heapify_all(&iter->heap, &callbacks, NULL); while (!btree_iter_end(iter)) { if (b->ops->sort_fixup && fixup) @@ -1293,10 +1324,11 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, struct bset_sort_state *state) { size_t order = b->page_order, keys = 0; - struct btree_iter_stack iter; + struct btree_iter iter; int oldsize = bch_count_data(b); - __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]); + min_heap_init(&iter.heap, NULL, MAX_BSETS); + __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); if (start) { unsigned int i; @@ -1307,7 +1339,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, order = get_order(__set_bytes(b->set->data, keys)); } - __btree_sort(b, &iter.iter, start, order, false, state); + __btree_sort(b, &iter, start, order, false, state); EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } @@ -1323,11 +1355,13 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, struct bset_sort_state *state) { uint64_t start_time = local_clock(); - struct btree_iter_stack iter; + struct btree_iter iter; + + min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_stack_init(b, &iter, NULL); + bch_btree_iter_init(b, &iter, NULL); - btree_mergesort(b, new->set->data, &iter.iter, false, true); + btree_mergesort(b, new->set->data, &iter, false, true); bch_time_stats_update(&state->time, start_time); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 011f6062c4c0..f79441acd4c1 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -187,8 +187,9 @@ struct bset_tree { }; struct btree_keys_ops { - bool (*sort_cmp)(struct btree_iter_set l, - struct btree_iter_set r); + bool (*sort_cmp)(const void *l, + const void *r, + void *args); struct bkey *(*sort_fixup)(struct btree_iter *iter, struct bkey *tmp); bool (*insert_fixup)(struct btree_keys *b, @@ -312,23 +313,17 @@ enum { BTREE_INSERT_STATUS_FRONT_MERGE, }; +struct btree_iter_set { + struct bkey *k, *end; +}; + /* Btree key iteration */ struct btree_iter { - size_t size, used; #ifdef CONFIG_BCACHE_DEBUG struct btree_keys *b; #endif - struct btree_iter_set { - struct bkey *k, *end; - } data[]; -}; - -/* Fixed-size btree_iter that can be allocated on the stack */ - -struct btree_iter_stack { - struct btree_iter iter; - struct btree_iter_set stack_data[MAX_BSETS]; + MIN_HEAP_PREALLOCATED(struct btree_iter_set, btree_iter_heap, MAX_BSETS) heap; }; typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); @@ -340,9 +335,9 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end); -struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, - struct btree_iter_stack *iter, - struct bkey *search); +struct bkey *bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search); struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, const struct bkey *search); @@ -357,14 +352,13 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b, return search ? 
__bch_bset_search(b, t, search) : t->data->start; } -#define for_each_key_filter(b, k, stack_iter, filter) \ - for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ - ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \ - filter));) +#define for_each_key_filter(b, k, iter, filter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next_filter((iter), (b), filter));) -#define for_each_key(b, k, stack_iter) \ - for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ - ((k) = bch_btree_iter_next(&((stack_iter)->iter)));) +#define for_each_key(b, k, iter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next(iter));) /* Sorting */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 4e6ccf2c8a0b..ed40d8600656 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -149,19 +149,19 @@ void bch_btree_node_read_done(struct btree *b) { const char *err = "bad btree header"; struct bset *i = btree_bset_first(b); - struct btree_iter *iter; + struct btree_iter iter; /* * c->fill_iter can allocate an iterator with more memory space * than static MAX_BSETS. * See the comment arount cache_set->fill_iter. */ - iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); - iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; - iter->used = 0; + iter.heap.data = mempool_alloc(&b->c->fill_iter, GFP_NOIO); + iter.heap.size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; + iter.heap.nr = 0; #ifdef CONFIG_BCACHE_DEBUG - iter->b = &b->keys; + iter.b = &b->keys; #endif if (!i->seq) @@ -199,7 +199,7 @@ void bch_btree_node_read_done(struct btree *b) if (i != b->keys.set[0].data && !i->keys) goto err; - bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); + bch_btree_iter_push(&iter, i->start, bset_bkey_last(i)); b->written += set_blocks(i, block_bytes(b->c->cache)); } @@ -211,7 +211,7 @@ void bch_btree_node_read_done(struct btree *b) if (i->seq == b->keys.set[0].data->seq) goto err; - bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); + bch_btree_sort_and_fix_extents(&b->keys, &iter, &b->c->sort); i = b->keys.set[0].data; err = "short btree key"; @@ -223,7 +223,7 @@ void bch_btree_node_read_done(struct btree *b) bch_bset_init_next(&b->keys, write_block(b), bset_magic(&b->c->cache->sb)); out: - mempool_free(iter, &b->c->fill_iter); + mempool_free(iter.heap.data, &b->c->fill_iter); return; err: set_btree_node_io_error(b); @@ -1309,9 +1309,11 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) uint8_t stale = 0; unsigned int keys = 0, good_keys = 0; struct bkey *k; - struct btree_iter_stack iter; + struct btree_iter iter; struct bset_tree *t; + min_heap_init(&iter.heap, NULL, MAX_BSETS); + gc->nodes++; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { @@ -1570,9 +1572,11 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, static unsigned int btree_gc_count_keys(struct btree *b) { struct bkey *k; - struct btree_iter_stack iter; + struct btree_iter iter; unsigned int ret = 0; + min_heap_init(&iter.heap, NULL, MAX_BSETS); + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) ret += bkey_u64s(k); @@ -1611,18 +1615,18 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, int ret = 0; bool should_rewrite; struct bkey *k; - struct btree_iter_stack iter; + struct btree_iter iter; struct gc_merge_info r[GC_MERGE_NODES]; struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; - bch_btree_iter_stack_init(&b->keys, &iter, 
&b->c->gc_done); + min_heap_init(&iter.heap, NULL, MAX_BSETS); + bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); for (i = r; i < r + ARRAY_SIZE(r); i++) i->b = ERR_PTR(-EINTR); while (1) { - k = bch_btree_iter_next_filter(&iter.iter, &b->keys, - bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); if (k) { r->b = bch_btree_node_get(b->c, op, k, b->level - 1, true, b); @@ -1917,7 +1921,9 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) { int ret = 0; struct bkey *k, *p = NULL; - struct btree_iter_stack iter; + struct btree_iter iter; + + min_heap_init(&iter.heap, NULL, MAX_BSETS); for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(b->c, b->level, k); @@ -1925,10 +1931,10 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) bch_initial_mark_key(b->c, b->level + 1, &b->key); if (b->level) { - bch_btree_iter_stack_init(&b->keys, &iter, NULL); + bch_btree_iter_init(&b->keys, &iter, NULL); do { - k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); if (k) { btree_node_prefetch(b, k); @@ -1956,7 +1962,7 @@ static int bch_btree_check_thread(void *arg) struct btree_check_info *info = arg; struct btree_check_state *check_state = info->state; struct cache_set *c = check_state->c; - struct btree_iter_stack iter; + struct btree_iter iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; @@ -1964,9 +1970,11 @@ static int bch_btree_check_thread(void *arg) cur_idx = prev_idx = 0; ret = 0; + min_heap_init(&iter.heap, NULL, MAX_BSETS); + /* root node keys are checked before thread created */ - bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -1984,7 +1992,7 @@ static int bch_btree_check_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter.iter, + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); if (k) @@ -2057,9 +2065,11 @@ int bch_btree_check(struct cache_set *c) int ret = 0; int i; struct bkey *k = NULL; - struct btree_iter_stack iter; + struct btree_iter iter; struct btree_check_state check_state; + min_heap_init(&iter.heap, NULL, MAX_BSETS); + /* check and mark root node keys */ for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(c, c->root->level, k); @@ -2553,11 +2563,12 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, if (b->level) { struct bkey *k; - struct btree_iter_stack iter; + struct btree_iter iter; - bch_btree_iter_stack_init(&b->keys, &iter, from); + min_heap_init(&iter.heap, NULL, MAX_BSETS); + bch_btree_iter_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = bcache_btree(map_nodes_recurse, k, b, op, from, fn, flags); @@ -2586,12 +2597,12 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, { int ret = MAP_CONTINUE; struct bkey *k; - struct btree_iter_stack iter; + struct btree_iter iter; - bch_btree_iter_stack_init(&b->keys, &iter, from); + min_heap_init(&iter.heap, NULL, MAX_BSETS); + bch_btree_iter_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, - bch_ptr_bad))) { + while ((k = 
bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = !b->level ? fn(op, b, k) : bcache_btree(map_keys_recurse, k, diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index d626ffcbecb9..a7221e5dbe81 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -33,15 +33,16 @@ static void sort_key_next(struct btree_iter *iter, i->k = bkey_next(i->k); if (i->k == i->end) - *i = iter->data[--iter->used]; + *i = iter->heap.data[--iter->heap.nr]; } -static bool bch_key_sort_cmp(struct btree_iter_set l, - struct btree_iter_set r) +static bool new_bch_key_sort_cmp(const void *l, const void *r, void *args) { - int64_t c = bkey_cmp(l.k, r.k); + struct btree_iter_set *_l = (struct btree_iter_set *)l; + struct btree_iter_set *_r = (struct btree_iter_set *)r; + int64_t c = bkey_cmp(_l->k, _r->k); - return c ? c > 0 : l.k < r.k; + return !(c ? c > 0 : _l->k < _r->k); } static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) @@ -238,7 +239,7 @@ static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, } const struct btree_keys_ops bch_btree_keys_ops = { - .sort_cmp = bch_key_sort_cmp, + .sort_cmp = new_bch_key_sort_cmp, .insert_fixup = bch_btree_ptr_insert_fixup, .key_invalid = bch_btree_ptr_invalid, .key_bad = bch_btree_ptr_bad, @@ -255,22 +256,36 @@ const struct btree_keys_ops bch_btree_keys_ops = { * Necessary for btree_sort_fixup() - if there are multiple keys that compare * equal in different sets, we have to process them newest to oldest. */ -static bool bch_extent_sort_cmp(struct btree_iter_set l, - struct btree_iter_set r) + +static bool new_bch_extent_sort_cmp(const void *l, const void *r, void __always_unused *args) +{ + struct btree_iter_set *_l = (struct btree_iter_set *)l; + struct btree_iter_set *_r = (struct btree_iter_set *)r; + int64_t c = bkey_cmp(&START_KEY(_l->k), &START_KEY(_r->k)); + + return !(c ? c > 0 : _l->k < _r->k); +} + +static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args) { - int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); + struct btree_iter_set *_iter1 = iter1; + struct btree_iter_set *_iter2 = iter2; - return c ? 
c > 0 : l.k < r.k; + swap(*_iter1, *_iter2); } static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, struct bkey *tmp) { - while (iter->used > 1) { - struct btree_iter_set *top = iter->data, *i = top + 1; - - if (iter->used > 2 && - bch_extent_sort_cmp(i[0], i[1])) + const struct min_heap_callbacks callbacks = { + .less = new_bch_extent_sort_cmp, + .swp = new_btree_iter_swap, + }; + while (iter->heap.nr > 1) { + struct btree_iter_set *top = iter->heap.data, *i = top + 1; + + if (iter->heap.nr > 2 && + !new_bch_extent_sort_cmp(&i[0], &i[1], NULL)) i++; if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) @@ -278,7 +293,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, if (!KEY_SIZE(i->k)) { sort_key_next(iter, i); - heap_sift(iter, i - top, bch_extent_sort_cmp); + min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL); continue; } @@ -288,7 +303,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, else bch_cut_front(top->k, i->k); - heap_sift(iter, i - top, bch_extent_sort_cmp); + min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL); } else { /* can't happen because of comparison func */ BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); @@ -298,7 +313,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, bch_cut_back(&START_KEY(i->k), tmp); bch_cut_front(i->k, top->k); - heap_sift(iter, 0, bch_extent_sort_cmp); + min_heap_sift_down(&iter->heap, 0, &callbacks, NULL); return tmp; } else { @@ -618,7 +633,7 @@ static bool bch_extent_merge(struct btree_keys *bk, } const struct btree_keys_ops bch_extent_keys_ops = { - .sort_cmp = bch_extent_sort_cmp, + .sort_cmp = new_bch_extent_sort_cmp, .sort_fixup = bch_extent_sort_fixup, .insert_fixup = bch_extent_insert_fixup, .key_invalid = bch_extent_invalid, diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index ebd500bdf0b2..7f482729c56d 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -182,16 +182,27 @@ err: if (!IS_ERR_OR_NULL(w->private)) closure_sync(&cl); } -static bool bucket_cmp(struct bucket *l, struct bucket *r) +static bool new_bucket_cmp(const void *l, const void *r, void __always_unused *args) { - return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); + struct bucket **_l = (struct bucket **)l; + struct bucket **_r = (struct bucket **)r; + + return GC_SECTORS_USED(*_l) >= GC_SECTORS_USED(*_r); +} + +static void new_bucket_swap(void *l, void *r, void __always_unused *args) +{ + struct bucket **_l = l; + struct bucket **_r = r; + + swap(*_l, *_r); } static unsigned int bucket_heap_top(struct cache *ca) { struct bucket *b; - return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0; + return (b = min_heap_peek(&ca->heap)[0]) ? 
GC_SECTORS_USED(b) : 0; } void bch_moving_gc(struct cache_set *c) @@ -199,6 +210,10 @@ void bch_moving_gc(struct cache_set *c) struct cache *ca = c->cache; struct bucket *b; unsigned long sectors_to_move, reserve_sectors; + const struct min_heap_callbacks callbacks = { + .less = new_bucket_cmp, + .swp = new_bucket_swap, + }; if (!c->copy_gc_enabled) return; @@ -209,7 +224,7 @@ void bch_moving_gc(struct cache_set *c) reserve_sectors = ca->sb.bucket_size * fifo_used(&ca->free[RESERVE_MOVINGGC]); - ca->heap.used = 0; + ca->heap.nr = 0; for_each_bucket(b, ca) { if (GC_MARK(b) == GC_MARK_METADATA || @@ -218,25 +233,31 @@ void bch_moving_gc(struct cache_set *c) atomic_read(&b->pin)) continue; - if (!heap_full(&ca->heap)) { + if (!min_heap_full(&ca->heap)) { sectors_to_move += GC_SECTORS_USED(b); - heap_add(&ca->heap, b, bucket_cmp); - } else if (bucket_cmp(b, heap_peek(&ca->heap))) { + min_heap_push(&ca->heap, &b, &callbacks, NULL); + } else if (!new_bucket_cmp(&b, min_heap_peek(&ca->heap), ca)) { sectors_to_move -= bucket_heap_top(ca); sectors_to_move += GC_SECTORS_USED(b); ca->heap.data[0] = b; - heap_sift(&ca->heap, 0, bucket_cmp); + min_heap_sift_down(&ca->heap, 0, &callbacks, NULL); } } while (sectors_to_move > reserve_sectors) { - heap_pop(&ca->heap, b, bucket_cmp); + if (ca->heap.nr) { + b = min_heap_peek(&ca->heap)[0]; + min_heap_pop(&ca->heap, &callbacks, NULL); + } sectors_to_move -= GC_SECTORS_USED(b); } - while (heap_pop(&ca->heap, b, bucket_cmp)) + while (ca->heap.nr) { + b = min_heap_peek(&ca->heap)[0]; + min_heap_pop(&ca->heap, &callbacks, NULL); SET_GC_MOVE(b, 1); + } mutex_unlock(&c->bucket_lock); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 4d11fc664cb0..e7abfdd77c3b 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -897,7 +897,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, sector_t sectors, struct block_device *cached_bdev, const struct block_device_operations *ops) { - struct request_queue *q; const size_t max_stripes = min_t(size_t, INT_MAX, SIZE_MAX / sizeof(atomic_t)); struct queue_limits lim = { @@ -909,6 +908,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, .io_min = block_size, .logical_block_size = block_size, .physical_block_size = block_size, + .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA, }; uint64_t n; int idx; @@ -974,13 +974,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->minors = BCACHE_MINORS; d->disk->fops = ops; d->disk->private_data = d; - - q = d->disk->queue; - - blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue); - - blk_queue_write_cache(q, true, true); - return 0; out_bioset_exit: @@ -1423,8 +1416,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) } if (bdev_io_opt(dc->bdev)) - dc->partial_stripes_expensive = - q->limits.raid_partial_stripes_expensive; + dc->partial_stripes_expensive = !!(q->limits.features & + BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE); ret = bcache_device_init(&dc->disk, block_size, bdev_nr_sectors(dc->bdev) - dc->sb.data_offset, @@ -1914,8 +1907,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->data_buckets); - iter_size = sizeof(struct btree_iter) + - ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * + iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * sizeof(struct btree_iter_set); c->devices = kcalloc(c->nr_uuids, sizeof(void 
*), GFP_KERNEL); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 826b14cae4e5..e8f696cb58c0 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -660,7 +660,9 @@ static unsigned int bch_root_usage(struct cache_set *c) unsigned int bytes = 0; struct bkey *k; struct btree *b; - struct btree_iter_stack iter; + struct btree_iter iter; + + min_heap_init(&iter.heap, NULL, MAX_BSETS); goto lock_root; diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index ae380bc3992e..410d8cb49e50 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * random utiility code, for bcache but in theory not specific to bcache + * random utility code, for bcache but in theory not specific to bcache * * Copyright 2010, 2011 Kent Overstreet <[email protected]> * Copyright 2012 Google, Inc. diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index f61ab1bada6c..539454d8e2d0 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/sched/clock.h> #include <linux/llist.h> +#include <linux/min_heap.h> #include <linux/ratelimit.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> @@ -30,16 +31,10 @@ struct closure; #endif -#define DECLARE_HEAP(type, name) \ - struct { \ - size_t size, used; \ - type *data; \ - } name - #define init_heap(heap, _size, gfp) \ ({ \ size_t _bytes; \ - (heap)->used = 0; \ + (heap)->nr = 0; \ (heap)->size = (_size); \ _bytes = (heap)->size * sizeof(*(heap)->data); \ (heap)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \ @@ -52,64 +47,6 @@ do { \ (heap)->data = NULL; \ } while (0) -#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) - -#define heap_sift(h, i, cmp) \ -do { \ - size_t _r, _j = i; \ - \ - for (; _j * 2 + 1 < (h)->used; _j = _r) { \ - _r = _j * 2 + 1; \ - if (_r + 1 < (h)->used && \ - cmp((h)->data[_r], (h)->data[_r + 1])) \ - _r++; \ - \ - if (cmp((h)->data[_r], (h)->data[_j])) \ - break; \ - heap_swap(h, _r, _j); \ - } \ -} while (0) - -#define heap_sift_down(h, i, cmp) \ -do { \ - while (i) { \ - size_t p = (i - 1) / 2; \ - if (cmp((h)->data[i], (h)->data[p])) \ - break; \ - heap_swap(h, i, p); \ - i = p; \ - } \ -} while (0) - -#define heap_add(h, d, cmp) \ -({ \ - bool _r = !heap_full(h); \ - if (_r) { \ - size_t _i = (h)->used++; \ - (h)->data[_i] = d; \ - \ - heap_sift_down(h, _i, cmp); \ - heap_sift(h, _i, cmp); \ - } \ - _r; \ -}) - -#define heap_pop(h, d, cmp) \ -({ \ - bool _r = (h)->used; \ - if (_r) { \ - (d) = (h)->data[0]; \ - (h)->used--; \ - heap_swap(h, 0, (h)->used); \ - heap_sift(h, 0, cmp); \ - } \ - _r; \ -}) - -#define heap_peek(h) ((h)->used ? 
(h)->data[0] : NULL) - -#define heap_full(h) ((h)->used == (h)->size) - #define DECLARE_FIFO(type, name) \ struct { \ size_t front, back, size, mask; \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 792e070ccf38..c1d28e365910 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -908,15 +908,16 @@ static int bch_dirty_init_thread(void *arg) struct dirty_init_thrd_info *info = arg; struct bch_dirty_init_state *state = info->state; struct cache_set *c = state->c; - struct btree_iter_stack iter; + struct btree_iter iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; k = p = NULL; prev_idx = 0; - bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); + min_heap_init(&iter.heap, NULL, MAX_BSETS); + bch_btree_iter_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -930,7 +931,7 @@ static int bch_dirty_init_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter.iter, + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); if (k) @@ -979,11 +980,13 @@ void bch_sectors_dirty_init(struct bcache_device *d) int i; struct btree *b = NULL; struct bkey *k = NULL; - struct btree_iter_stack iter; + struct btree_iter iter; struct sectors_dirty_init op; struct cache_set *c = d->c; struct bch_dirty_init_state state; + min_heap_init(&iter.heap, NULL, MAX_BSETS); + retry_lock: b = c->root; rw_lock(0, b, b->level); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 96751cd3d181..24cd87fddf75 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -170,7 +170,7 @@ struct dm_cache_metadata { */ #define SUPERBLOCK_CSUM_XOR 9031977 -static void sb_prepare_for_write(struct dm_block_validator *v, +static void sb_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t sb_block_size) { @@ -195,7 +195,7 @@ static int check_metadata_version(struct cache_disk_superblock *disk_super) return 0; } -static int sb_check(struct dm_block_validator *v, +static int sb_check(const struct dm_block_validator *v, struct dm_block *b, size_t sb_block_size) { @@ -228,7 +228,7 @@ static int sb_check(struct dm_block_validator *v, return check_metadata_version(disk_super); } -static struct dm_block_validator sb_validator = { +static const struct dm_block_validator sb_validator = { .name = "superblock", .prepare_for_write = sb_prepare_for_write, .check = sb_check @@ -1282,15 +1282,6 @@ int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, return r; } -struct thunk { - load_mapping_fn fn; - void *context; - - struct dm_cache_metadata *cmd; - bool respect_dirty_flags; - bool hints_valid; -}; - static bool policy_unchanged(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) { diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 16884b585053..17f0fab1e254 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3403,7 +3403,6 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits) limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors; limits->discard_granularity = origin_limits->discard_granularity; limits->discard_alignment = origin_limits->discard_alignment; - limits->discard_misaligned = origin_limits->discard_misaligned; } static void cache_io_hints(struct dm_target *ti, 
struct queue_limits *limits) @@ -3417,8 +3416,8 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) */ if (io_opt_sectors < cache->sectors_per_block || do_div(io_opt_sectors, cache->sectors_per_block)) { - blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); - blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); + limits->io_min = cache->sectors_per_block << SECTOR_SHIFT; + limits->io_opt = cache->sectors_per_block << SECTOR_SHIFT; } disable_passdown_if_not_supported(cache); diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c index 47c1fa7aad8b..2db84cd2202b 100644 --- a/drivers/md/dm-clone-metadata.c +++ b/drivers/md/dm-clone-metadata.c @@ -163,7 +163,7 @@ struct dm_clone_metadata { /* * Superblock validation. */ -static void sb_prepare_for_write(struct dm_block_validator *v, +static void sb_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t sb_block_size) { struct superblock_disk *sb; @@ -177,7 +177,7 @@ static void sb_prepare_for_write(struct dm_block_validator *v, sb->csum = cpu_to_le32(csum); } -static int sb_check(struct dm_block_validator *v, struct dm_block *b, +static int sb_check(const struct dm_block_validator *v, struct dm_block *b, size_t sb_block_size) { struct superblock_disk *sb; @@ -220,7 +220,7 @@ static int sb_check(struct dm_block_validator *v, struct dm_block *b, return 0; } -static struct dm_block_validator sb_validator = { +static const struct dm_block_validator sb_validator = { .name = "superblock", .prepare_for_write = sb_prepare_for_write, .check = sb_check diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index ad79b52ffc14..12bbe487a4c8 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -2059,7 +2059,6 @@ static void set_discard_limits(struct clone *clone, struct queue_limits *limits) limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors; limits->discard_granularity = dest_limits->discard_granularity; limits->discard_alignment = dest_limits->discard_alignment; - limits->discard_misaligned = dest_limits->discard_misaligned; limits->max_discard_segments = dest_limits->max_discard_segments; } @@ -2074,8 +2073,8 @@ static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits) */ if (io_opt_sectors < clone->region_size || do_div(io_opt_sectors, clone->region_size)) { - blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT); - blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT); + limits->io_min = clone->region_size << SECTOR_SHIFT; + limits->io_opt = clone->region_size << SECTOR_SHIFT; } disable_passdown_if_not_supported(clone); diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 08700bfc3e23..3637761f3585 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -206,7 +206,8 @@ struct dm_table { bool integrity_supported:1; bool singleton:1; - unsigned integrity_added:1; + /* set if all the targets in the table have "flush_bypasses_map" set */ + bool flush_bypasses_map:1; /* * Indicates the rw permissions for the new logical device. 
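The dm-cache and dm-clone hunks above, like the earlier bcache super.c hunk, follow the block layer's shift from setter helpers (blk_queue_write_cache(), blk_limits_io_min(), blk_limits_io_opt()) to filling struct queue_limits fields directly, with write-cache and FUA support expressed as BLK_FEAT_* feature flags. A minimal sketch of the pattern against a mocked-up queue_limits; the flag values and the example_io_hints() name here are illustrative, not the kernel's:

#include <stdio.h>

/* Mock of the few struct queue_limits fields touched above;
 * the BLK_FEAT_* bit values are illustrative only. */
#define BLK_FEAT_WRITE_CACHE	(1u << 0)
#define BLK_FEAT_FUA		(1u << 1)
#define SECTOR_SHIFT		9

struct queue_limits {
	unsigned int io_min;
	unsigned int io_opt;
	unsigned int features;
};

/* Mirrors the cache_io_hints() change: assign the fields instead of
 * calling blk_limits_io_min()/blk_limits_io_opt(). */
static void example_io_hints(struct queue_limits *limits,
			     unsigned int sectors_per_block)
{
	limits->io_min = sectors_per_block << SECTOR_SHIFT;
	limits->io_opt = sectors_per_block << SECTOR_SHIFT;
	limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
}

int main(void)
{
	struct queue_limits lim = { 0 };

	example_io_hints(&lim, 8);	/* 8 sectors = 4 KiB blocks */
	printf("io_min=%u io_opt=%u features=%#x\n",
	       lim.io_min, lim.io_opt, lim.features);
	return 0;
}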
This diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1b7a97cc3779..348b4b26c272 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -214,7 +214,8 @@ struct crypt_config { unsigned int integrity_tag_size; unsigned int integrity_iv_size; - unsigned int on_disk_tag_size; + unsigned int used_tag_size; + unsigned int tuple_size; /* * pool for per bio private data, crypto requests, @@ -241,6 +242,31 @@ static unsigned int dm_crypt_clients_n; static volatile unsigned long dm_crypt_pages_per_client; #define DM_CRYPT_MEMORY_PERCENT 2 #define DM_CRYPT_MIN_PAGES_PER_CLIENT (BIO_MAX_VECS * 16) +#define DM_CRYPT_DEFAULT_MAX_READ_SIZE 131072 +#define DM_CRYPT_DEFAULT_MAX_WRITE_SIZE 131072 + +static unsigned int max_read_size = 0; +module_param(max_read_size, uint, 0644); +MODULE_PARM_DESC(max_read_size, "Maximum size of a read request"); +static unsigned int max_write_size = 0; +module_param(max_write_size, uint, 0644); +MODULE_PARM_DESC(max_write_size, "Maximum size of a write request"); +static unsigned get_max_request_size(struct crypt_config *cc, bool wrt) +{ + unsigned val, sector_align; + val = !wrt ? READ_ONCE(max_read_size) : READ_ONCE(max_write_size); + if (likely(!val)) + val = !wrt ? DM_CRYPT_DEFAULT_MAX_READ_SIZE : DM_CRYPT_DEFAULT_MAX_WRITE_SIZE; + if (wrt || cc->used_tag_size) { + if (unlikely(val > BIO_MAX_VECS << PAGE_SHIFT)) + val = BIO_MAX_VECS << PAGE_SHIFT; + } + sector_align = max(bdev_logical_block_size(cc->dev->bdev), (unsigned)cc->sector_size); + val = round_down(val, sector_align); + if (unlikely(!val)) + val = sector_align; + return val >> SECTOR_SHIFT; +} static void crypt_endio(struct bio *clone); static void kcryptd_queue_crypt(struct dm_crypt_io *io); @@ -1151,14 +1177,14 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) unsigned int tag_len; int ret; - if (!bio_sectors(bio) || !io->cc->on_disk_tag_size) + if (!bio_sectors(bio) || !io->cc->tuple_size) return 0; bip = bio_integrity_alloc(bio, GFP_NOIO, 1); if (IS_ERR(bip)) return PTR_ERR(bip); - tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift); + tag_len = io->cc->tuple_size * (bio_sectors(bio) >> io->cc->sector_shift); bip->bip_iter.bi_sector = io->cc->start + io->sector; @@ -1176,24 +1202,24 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk); struct mapped_device *md = dm_table_get_md(ti->table); - /* From now we require underlying device with our integrity profile */ - if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) { + /* We require an underlying device with non-PI metadata */ + if (!bi || bi->csum_type != BLK_INTEGRITY_CSUM_NONE) { ti->error = "Integrity profile not supported."; return -EINVAL; } - if (bi->tag_size != cc->on_disk_tag_size || - bi->tuple_size != cc->on_disk_tag_size) { + if (bi->tuple_size < cc->used_tag_size) { ti->error = "Integrity profile tag size mismatch."; return -EINVAL; } + cc->tuple_size = bi->tuple_size; if (1 << bi->interval_exp != cc->sector_size) { ti->error = "Integrity profile sector size mismatch."; return -EINVAL; } if (crypt_integrity_aead(cc)) { - cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size; + cc->integrity_tag_size = cc->used_tag_size - cc->integrity_iv_size; DMDEBUG("%s: Integrity AEAD, tag size %u, IV size %u.", dm_device_name(md), cc->integrity_tag_size, cc->integrity_iv_size); @@ -1205,7 +1231,7 @@ static int crypt_integrity_ctr(struct crypt_config 
*cc, struct dm_target *ti) DMDEBUG("%s: Additional per-sector space %u bytes for IV.", dm_device_name(md), cc->integrity_iv_size); - if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) { + if ((cc->integrity_tag_size + cc->integrity_iv_size) > cc->tuple_size) { ti->error = "Not enough space for integrity tag in the profile."; return -EINVAL; } @@ -1284,7 +1310,7 @@ static void *tag_from_dmreq(struct crypt_config *cc, struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); return &io->integrity_metadata[*org_tag_of_dmreq(cc, dmreq) * - cc->on_disk_tag_size]; + cc->tuple_size]; } static void *iv_tag_from_dmreq(struct crypt_config *cc, @@ -1365,9 +1391,9 @@ static int crypt_convert_block_aead(struct crypt_config *cc, aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out, cc->sector_size, iv); r = crypto_aead_encrypt(req); - if (cc->integrity_tag_size + cc->integrity_iv_size != cc->on_disk_tag_size) + if (cc->integrity_tag_size + cc->integrity_iv_size != cc->tuple_size) memset(tag + cc->integrity_tag_size + cc->integrity_iv_size, 0, - cc->on_disk_tag_size - (cc->integrity_tag_size + cc->integrity_iv_size)); + cc->tuple_size - (cc->integrity_tag_size + cc->integrity_iv_size)); } else { aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out, cc->sector_size + cc->integrity_tag_size, iv); @@ -1797,7 +1823,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io) return; if (likely(!io->ctx.aead_recheck) && unlikely(io->ctx.aead_failed) && - cc->on_disk_tag_size && bio_data_dir(base_bio) == READ) { + cc->used_tag_size && bio_data_dir(base_bio) == READ) { io->ctx.aead_recheck = true; io->ctx.aead_failed = false; io->error = 0; @@ -3181,7 +3207,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar ti->error = "Invalid integrity arguments"; return -EINVAL; } - cc->on_disk_tag_size = val; + cc->used_tag_size = val; sval = strchr(opt_string + strlen("integrity:"), ':') + 1; if (!strcasecmp(sval, "aead")) { set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags); @@ -3393,12 +3419,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret) goto bad; - cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->on_disk_tag_size; + cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->tuple_size; if (!cc->tag_pool_max_sectors) cc->tag_pool_max_sectors = 1; ret = mempool_init_kmalloc_pool(&cc->tag_pool, MIN_IOS, - cc->tag_pool_max_sectors * cc->on_disk_tag_size); + cc->tag_pool_max_sectors * cc->tuple_size); if (ret) { ti->error = "Cannot allocate integrity tags mempool"; goto bad; @@ -3474,6 +3500,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) { struct dm_crypt_io *io; struct crypt_config *cc = ti->private; + unsigned max_sectors; /* * If bio is REQ_PREFLUSH or REQ_OP_DISCARD, just bypass crypt queues. @@ -3492,9 +3519,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) /* * Check if bio is too large, split as needed. 
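The new get_max_request_size() above replaces the fixed BIO_MAX_VECS-based split in crypt_map() (shown next) with tunable per-direction limits. Its clamping arithmetic can be sketched standalone; the example_* names and userspace scaffolding are illustrative, while the default and the bio cap mirror DM_CRYPT_DEFAULT_MAX_READ_SIZE/DM_CRYPT_DEFAULT_MAX_WRITE_SIZE and BIO_MAX_VECS from the patch:

#include <stdbool.h>
#include <stdio.h>

#define SECTOR_SHIFT	9
#define PAGE_SHIFT	12	/* typical; architecture-dependent in the kernel */
#define BIO_MAX_VECS	256
#define DEFAULT_MAX_RW	131072u

/* Round v down to a multiple of a (a need not be a power of two). */
static unsigned round_down_to(unsigned v, unsigned a)
{
	return v - (v % a);
}

/*
 * Mirrors get_max_request_size(): pick the tunable (or its default),
 * cap writes and tagged I/O at what one bio can carry, then align
 * down to the device/crypto sector granularity.
 */
static unsigned example_max_request_sectors(unsigned tunable_bytes, bool wrt,
					    bool has_tags,
					    unsigned logical_block_size,
					    unsigned crypto_sector_size)
{
	unsigned val = tunable_bytes ? tunable_bytes : DEFAULT_MAX_RW;
	unsigned align = logical_block_size > crypto_sector_size ?
			 logical_block_size : crypto_sector_size;

	if (wrt || has_tags) {
		unsigned cap = BIO_MAX_VECS << PAGE_SHIFT;

		if (val > cap)
			val = cap;
	}
	val = round_down_to(val, align);
	if (!val)
		val = align;
	return val >> SECTOR_SHIFT;
}

int main(void)
{
	/* tunable unset, write path, 4096-byte crypto sectors: 256 sectors */
	printf("%u sectors\n",
	       example_max_request_sectors(0, true, false, 512, 4096));
	return 0;
}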
*/ - if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_VECS << PAGE_SHIFT)) && - (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size)) - dm_accept_partial_bio(bio, ((BIO_MAX_VECS << PAGE_SHIFT) >> SECTOR_SHIFT)); + max_sectors = get_max_request_size(cc, bio_data_dir(bio) == WRITE); + if (unlikely(bio_sectors(bio) > max_sectors)) + dm_accept_partial_bio(bio, max_sectors); /* * Ensure that bio is a multiple of internal sector encryption size @@ -3509,8 +3536,8 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) io = dm_per_bio_data(bio, cc->per_bio_data_size); crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); - if (cc->on_disk_tag_size) { - unsigned int tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift); + if (cc->tuple_size) { + unsigned int tag_len = cc->tuple_size * (bio_sectors(bio) >> cc->sector_shift); if (unlikely(tag_len > KMALLOC_MAX_SIZE)) io->integrity_metadata = NULL; @@ -3582,7 +3609,7 @@ static void crypt_status(struct dm_target *ti, status_type_t type, num_feature_args += test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT); num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); - if (cc->on_disk_tag_size) + if (cc->used_tag_size) num_feature_args++; if (num_feature_args) { DMEMIT(" %d", num_feature_args); @@ -3598,8 +3625,8 @@ static void crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" no_read_workqueue"); if (test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) DMEMIT(" no_write_workqueue"); - if (cc->on_disk_tag_size) - DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth); + if (cc->used_tag_size) + DMEMIT(" integrity:%u:%s", cc->used_tag_size, cc->cipher_auth); if (cc->sector_size != (1 << SECTOR_SHIFT)) DMEMIT(" sector_size:%d", cc->sector_size); if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) @@ -3621,9 +3648,9 @@ static void crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(",iv_large_sectors=%c", test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags) ? 
'y' : 'n'); - if (cc->on_disk_tag_size) + if (cc->used_tag_size) DMEMIT(",integrity_tag_size=%u,cipher_auth=%s", - cc->on_disk_tag_size, cc->cipher_auth); + cc->used_tag_size, cc->cipher_auth); if (cc->sector_size != (1 << SECTOR_SHIFT)) DMEMIT(",sector_size=%d", cc->sector_size); if (cc->cipher_string) @@ -3731,7 +3758,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 26, 0}, + .version = {1, 27, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index b70d4016c2ac..ec5db1478b2f 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -428,7 +428,7 @@ static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->logical_block_size = to_bytes(ec->e_bs); limits->physical_block_size = to_bytes(ec->u_bs); limits->alignment_offset = limits->physical_block_size; - blk_limits_io_min(limits, limits->logical_block_size); + limits->io_min = limits->logical_block_size; } static int ebs_iterate_devices(struct dm_target *ti, diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 8f81e597858d..9c84e9d13eca 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -196,7 +196,7 @@ struct superblock_disk { * Superblock validation *-------------------------------------------------------------- */ -static void sb_prepare_for_write(struct dm_block_validator *v, +static void sb_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t sb_block_size) { @@ -221,7 +221,7 @@ static int check_metadata_version(struct superblock_disk *disk) return 0; } -static int sb_check(struct dm_block_validator *v, +static int sb_check(const struct dm_block_validator *v, struct dm_block *b, size_t sb_block_size) { @@ -254,7 +254,7 @@ static int sb_check(struct dm_block_validator *v, return check_metadata_version(disk); } -static struct dm_block_validator sb_validator = { +static const struct dm_block_validator sb_validator = { .name = "superblock", .prepare_for_write = sb_prepare_for_write, .check = sb_check @@ -1733,8 +1733,8 @@ static void era_io_hints(struct dm_target *ti, struct queue_limits *limits) */ if (io_opt_sectors < era->sectors_per_block || do_div(io_opt_sectors, era->sectors_per_block)) { - blk_limits_io_min(limits, 0); - blk_limits_io_opt(limits, era->sectors_per_block << SECTOR_SHIFT); + limits->io_min = 0; + limits->io_opt = era->sectors_per_block << SECTOR_SHIFT; } } diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index 2a71bcdba92d..b37bbe762500 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -212,8 +212,10 @@ static char __init *dm_parse_device_entry(struct dm_device *dev, char *str) strscpy(dev->dmi.uuid, field[1], sizeof(dev->dmi.uuid)); /* minor */ if (strlen(field[2])) { - if (kstrtoull(field[2], 0, &dev->dmi.dev)) + if (kstrtoull(field[2], 0, &dev->dmi.dev) || + dev->dmi.dev >= (1 << MINORBITS)) return ERR_PTR(-EINVAL); + dev->dmi.dev = huge_encode_dev((dev_t)dev->dmi.dev); dev->dmi.flags |= DM_PERSISTENT_DEV_FLAG; } /* flags */ diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 417fddebe367..48ac628f471b 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -44,6 +44,7 @@ #define BITMAP_FLUSH_INTERVAL (10 * HZ) #define DISCARD_FILLER 0xf6 #define SALT_SIZE 16 +#define RECHECK_POOL_SIZE 256 /* * Warning - DEBUG_PRINT prints security-sensitive data to 
the log, @@ -62,6 +63,7 @@ #define SB_VERSION_3 3 #define SB_VERSION_4 4 #define SB_VERSION_5 5 +#define SB_VERSION_6 6 #define SB_SECTORS 8 #define MAX_SECTORS_PER_BLOCK 8 @@ -86,6 +88,7 @@ struct superblock { #define SB_FLAG_DIRTY_BITMAP 0x4 #define SB_FLAG_FIXED_PADDING 0x8 #define SB_FLAG_FIXED_HMAC 0x10 +#define SB_FLAG_INLINE 0x20 #define JOURNAL_ENTRY_ROUNDUP 8 @@ -166,6 +169,7 @@ struct dm_integrity_c { struct dm_dev *meta_dev; unsigned int tag_size; __s8 log2_tag_size; + unsigned int tuple_size; sector_t start; mempool_t journal_io_mempool; struct dm_io_client *io; @@ -279,6 +283,7 @@ struct dm_integrity_c { atomic64_t number_of_mismatches; mempool_t recheck_pool; + struct bio_set recheck_bios; struct notifier_block reboot_notifier; }; @@ -314,6 +319,9 @@ struct dm_integrity_io { struct completion *completion; struct dm_bio_details bio_details; + + char *integrity_payload; + bool integrity_payload_from_mempool; }; struct journal_completion { @@ -350,26 +358,8 @@ static struct kmem_cache *journal_io_cache; #define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) #endif -static void dm_integrity_prepare(struct request *rq) -{ -} - -static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes) -{ -} - -/* - * DM Integrity profile, protection is performed layer above (dm-crypt) - */ -static const struct blk_integrity_profile dm_integrity_profile = { - .name = "DM-DIF-EXT-TAG", - .generate_fn = NULL, - .verify_fn = NULL, - .prepare_fn = dm_integrity_prepare, - .complete_fn = dm_integrity_complete, -}; - static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); +static int dm_integrity_map_inline(struct dm_integrity_io *dio); static void integrity_bio_wait(struct work_struct *w); static void dm_integrity_dtr(struct dm_target *ti); @@ -479,7 +469,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned int *sec_ptr) static void sb_set_version(struct dm_integrity_c *ic) { - if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) + if (ic->sb->flags & cpu_to_le32(SB_FLAG_INLINE)) + ic->sb->version = SB_VERSION_6; + else if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) ic->sb->version = SB_VERSION_5; else if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) ic->sb->version = SB_VERSION_4; @@ -1913,6 +1905,35 @@ error: dec_in_flight(dio); } +static inline bool dm_integrity_check_limits(struct dm_integrity_c *ic, sector_t logical_sector, struct bio *bio) +{ + if (unlikely(logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) { + DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx", + logical_sector, bio_sectors(bio), + ic->provided_data_sectors); + return false; + } + if (unlikely((logical_sector | bio_sectors(bio)) & (unsigned int)(ic->sectors_per_block - 1))) { + DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x", + ic->sectors_per_block, + logical_sector, bio_sectors(bio)); + return false; + } + if (ic->sectors_per_block > 1 && likely(bio_op(bio) != REQ_OP_DISCARD)) { + struct bvec_iter iter; + struct bio_vec bv; + + bio_for_each_segment(bv, bio, iter) { + if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) { + DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary", + bv.bv_offset, bv.bv_len, ic->sectors_per_block); + return false; + } + } + } + return true; +} + static int dm_integrity_map(struct dm_target *ti, struct bio *bio) { struct dm_integrity_c *ic = ti->private; @@ -1925,6 +1946,9 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) dio->bi_status = 0; dio->op = 
bio_op(bio); + if (ic->mode == 'I') + return dm_integrity_map_inline(dio); + if (unlikely(dio->op == REQ_OP_DISCARD)) { if (ti->max_io_len) { sector_t sec = dm_target_offset(ti, bio->bi_iter.bi_sector); @@ -1954,31 +1978,8 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) */ bio->bi_opf &= ~REQ_FUA; } - if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) { - DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx", - dio->range.logical_sector, bio_sectors(bio), - ic->provided_data_sectors); + if (unlikely(!dm_integrity_check_limits(ic, dio->range.logical_sector, bio))) return DM_MAPIO_KILL; - } - if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned int)(ic->sectors_per_block - 1))) { - DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x", - ic->sectors_per_block, - dio->range.logical_sector, bio_sectors(bio)); - return DM_MAPIO_KILL; - } - - if (ic->sectors_per_block > 1 && likely(dio->op != REQ_OP_DISCARD)) { - struct bvec_iter iter; - struct bio_vec bv; - - bio_for_each_segment(bv, bio, iter) { - if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) { - DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary", - bv.bv_offset, bv.bv_len, ic->sectors_per_block); - return DM_MAPIO_KILL; - } - } - } bip = bio_integrity(bio); if (!ic->internal_hash) { @@ -2394,6 +2395,213 @@ journal_read_write: do_endio_flush(ic, dio); } +static int dm_integrity_map_inline(struct dm_integrity_io *dio) +{ + struct dm_integrity_c *ic = dio->ic; + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + struct bio_integrity_payload *bip; + unsigned payload_len, digest_size, extra_size, ret; + + dio->integrity_payload = NULL; + dio->integrity_payload_from_mempool = false; + + if (unlikely(bio_integrity(bio))) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + + bio_set_dev(bio, ic->dev->bdev); + if (unlikely((bio->bi_opf & REQ_PREFLUSH) != 0)) + return DM_MAPIO_REMAPPED; + +retry: + payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block); + digest_size = crypto_shash_digestsize(ic->internal_hash); + extra_size = unlikely(digest_size > ic->tag_size) ? 
digest_size - ic->tag_size : 0;
+	payload_len += extra_size;
+	dio->integrity_payload = kmalloc(payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+	if (unlikely(!dio->integrity_payload)) {
+		const unsigned x_size = PAGE_SIZE << 1;
+		if (payload_len > x_size) {
+			unsigned sectors = ((x_size - extra_size) / ic->tuple_size) << ic->sb->log2_sectors_per_block;
+			if (WARN_ON(!sectors || sectors >= bio_sectors(bio))) {
+				bio->bi_status = BLK_STS_NOTSUPP;
+				bio_endio(bio);
+				return DM_MAPIO_SUBMITTED;
+			}
+			dm_accept_partial_bio(bio, sectors);
+			goto retry;
+		}
+		dio->integrity_payload = page_to_virt((struct page *)mempool_alloc(&ic->recheck_pool, GFP_NOIO));
+		dio->integrity_payload_from_mempool = true;
+	}
+
+	bio->bi_iter.bi_sector = dm_target_offset(ic->ti, bio->bi_iter.bi_sector);
+	dio->bio_details.bi_iter = bio->bi_iter;
+
+	if (unlikely(!dm_integrity_check_limits(ic, bio->bi_iter.bi_sector, bio))) {
+		return DM_MAPIO_KILL;
+	}
+
+	bio->bi_iter.bi_sector += ic->start + SB_SECTORS;
+
+	bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
+	if (unlikely(IS_ERR(bip))) {
+		bio->bi_status = errno_to_blk_status(PTR_ERR(bip));
+		bio_endio(bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	if (dio->op == REQ_OP_WRITE) {
+		unsigned pos = 0;
+		while (dio->bio_details.bi_iter.bi_size) {
+			struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter);
+			const char *mem = bvec_kmap_local(&bv);
+			if (ic->tag_size < ic->tuple_size)
+				memset(dio->integrity_payload + pos + ic->tag_size, 0, ic->tuple_size - ic->tag_size);
+			integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, dio->integrity_payload + pos);
+			kunmap_local(mem);
+			pos += ic->tuple_size;
+			bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT);
+		}
+	}
+
+	ret = bio_integrity_add_page(bio, virt_to_page(dio->integrity_payload),
+				payload_len, offset_in_page(dio->integrity_payload));
+	if (unlikely(ret != payload_len)) {
+		bio->bi_status = BLK_STS_RESOURCE;
+		bio_endio(bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	return DM_MAPIO_REMAPPED;
+}
+
+static inline void dm_integrity_free_payload(struct dm_integrity_io *dio)
+{
+	struct dm_integrity_c *ic = dio->ic;
+	if (unlikely(dio->integrity_payload_from_mempool))
+		mempool_free(virt_to_page(dio->integrity_payload), &ic->recheck_pool);
+	else
+		kfree(dio->integrity_payload);
+	dio->integrity_payload = NULL;
+	dio->integrity_payload_from_mempool = false;
+}
+
+static void dm_integrity_inline_recheck(struct work_struct *w)
+{
+	struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
+	struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+	struct dm_integrity_c *ic = dio->ic;
+	struct bio *outgoing_bio;
+	void *outgoing_data;
+
+	dio->integrity_payload = page_to_virt((struct page *)mempool_alloc(&ic->recheck_pool, GFP_NOIO));
+	dio->integrity_payload_from_mempool = true;
+
+	outgoing_data = dio->integrity_payload + PAGE_SIZE;
+
+	while (dio->bio_details.bi_iter.bi_size) {
+		char digest[HASH_MAX_DIGESTSIZE];
+		int r;
+		struct bio_integrity_payload *bip;
+		struct bio_vec bv;
+		char *mem;
+
+		outgoing_bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recheck_bios);
+
+		r = bio_add_page(outgoing_bio, virt_to_page(outgoing_data), ic->sectors_per_block << SECTOR_SHIFT, 0);
+		if (unlikely(r != (ic->sectors_per_block << SECTOR_SHIFT))) {
+			bio_put(outgoing_bio);
+			bio->bi_status = BLK_STS_RESOURCE;
+			bio_endio(bio);
+			return;
+		}
+
+		bip = bio_integrity_alloc(outgoing_bio,
GFP_NOIO, 1); + if (unlikely(IS_ERR(bip))) { + bio_put(outgoing_bio); + bio->bi_status = errno_to_blk_status(PTR_ERR(bip)); + bio_endio(bio); + return; + } + + r = bio_integrity_add_page(outgoing_bio, virt_to_page(dio->integrity_payload), ic->tuple_size, 0); + if (unlikely(r != ic->tuple_size)) { + bio_put(outgoing_bio); + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return; + } + + outgoing_bio->bi_iter.bi_sector = dio->bio_details.bi_iter.bi_sector + ic->start + SB_SECTORS; + + r = submit_bio_wait(outgoing_bio); + if (unlikely(r != 0)) { + bio_put(outgoing_bio); + bio->bi_status = errno_to_blk_status(r); + bio_endio(bio); + return; + } + bio_put(outgoing_bio); + + integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest); + if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { + DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", + ic->dev->bdev, dio->bio_details.bi_iter.bi_sector); + atomic64_inc(&ic->number_of_mismatches); + dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum", + bio, dio->bio_details.bi_iter.bi_sector, 0); + + bio->bi_status = BLK_STS_PROTECTION; + bio_endio(bio); + return; + } + + bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); + mem = bvec_kmap_local(&bv); + memcpy(mem, outgoing_data, ic->sectors_per_block << SECTOR_SHIFT); + kunmap_local(mem); + + bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); + } + + bio_endio(bio); +} + +static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status) +{ + struct dm_integrity_c *ic = ti->private; + if (ic->mode == 'I') { + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK)) { + unsigned pos = 0; + while (dio->bio_details.bi_iter.bi_size) { + char digest[HASH_MAX_DIGESTSIZE]; + struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); + char *mem = bvec_kmap_local(&bv); + //memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT); + integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest); + if (unlikely(memcmp(digest, dio->integrity_payload + pos, + min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { + kunmap_local(mem); + dm_integrity_free_payload(dio); + INIT_WORK(&dio->work, dm_integrity_inline_recheck); + queue_work(ic->offload_wq, &dio->work); + return DM_ENDIO_INCOMPLETE; + } + kunmap_local(mem); + pos += ic->tuple_size; + bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); + } + } + if (likely(dio->op == REQ_OP_READ) || likely(dio->op == REQ_OP_WRITE)) { + dm_integrity_free_payload(dio); + } + } + return DM_ENDIO_DONE; +} static void integrity_bio_wait(struct work_struct *w) { @@ -2432,6 +2640,9 @@ static void integrity_commit(struct work_struct *w) del_timer(&ic->autocommit_timer); + if (ic->mode == 'I') + return; + spin_lock_irq(&ic->endio_wait.lock); flushes = bio_list_get(&ic->flush_bio_list); if (unlikely(ic->mode != 'J')) { @@ -3490,10 +3701,21 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim if (ic->sectors_per_block > 1) { limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT; limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT; - blk_limits_io_min(limits, ic->sectors_per_block << SECTOR_SHIFT); + limits->io_min = ic->sectors_per_block << SECTOR_SHIFT; limits->dma_alignment = 
limits->logical_block_size - 1; limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT; } + + if (!ic->internal_hash) { + struct blk_integrity *bi = &limits->integrity; + + memset(bi, 0, sizeof(*bi)); + bi->tuple_size = ic->tag_size; + bi->tag_size = bi->tuple_size; + bi->interval_exp = + ic->sb->log2_sectors_per_block + SECTOR_SHIFT; + } + limits->max_integrity_segments = USHRT_MAX; } @@ -3523,7 +3745,10 @@ static int calculate_device_limits(struct dm_integrity_c *ic) return -EINVAL; ic->initial_sectors = initial_sectors; - if (!ic->meta_dev) { + if (ic->mode == 'I') { + if (ic->initial_sectors + ic->provided_data_sectors > ic->meta_device_sectors) + return -EINVAL; + } else if (!ic->meta_dev) { sector_t last_sector, last_area, last_offset; /* we have to maintain excessive padding for compatibility with existing volumes */ @@ -3586,6 +3811,8 @@ static int initialize_superblock(struct dm_integrity_c *ic, memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT); memcpy(ic->sb->magic, SB_MAGIC, 8); + if (ic->mode == 'I') + ic->sb->flags |= cpu_to_le32(SB_FLAG_INLINE); ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size); ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block); if (ic->journal_mac_alg.alg_string) @@ -3595,6 +3822,8 @@ static int initialize_superblock(struct dm_integrity_c *ic, journal_sections = journal_sectors / ic->journal_section_sectors; if (!journal_sections) journal_sections = 1; + if (ic->mode == 'I') + journal_sections = 0; if (ic->fix_hmac && (ic->internal_hash_alg.alg_string || ic->journal_mac_alg.alg_string)) { ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_HMAC); @@ -3650,20 +3879,6 @@ try_smaller_buffer: return 0; } -static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) -{ - struct gendisk *disk = dm_disk(dm_table_get_md(ti->table)); - struct blk_integrity bi; - - memset(&bi, 0, sizeof(bi)); - bi.profile = &dm_integrity_profile; - bi.tuple_size = ic->tag_size; - bi.tag_size = bi.tuple_size; - bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT; - - blk_integrity_register(disk, &bi); -} - static void dm_integrity_free_page_list(struct page_list *pl) { unsigned int i; @@ -4157,10 +4372,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv } if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") || - !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) { + !strcmp(argv[3], "D") || !strcmp(argv[3], "R") || + !strcmp(argv[3], "I")) { ic->mode = argv[3][0]; } else { - ti->error = "Invalid mode (expecting J, B, D, R)"; + ti->error = "Invalid mode (expecting J, B, D, R, I)"; r = -EINVAL; goto bad; } @@ -4306,6 +4522,53 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv else ic->log2_tag_size = -1; + if (ic->mode == 'I') { + struct blk_integrity *bi; + if (ic->meta_dev) { + r = -EINVAL; + ti->error = "Metadata device not supported in inline mode"; + goto bad; + } + if (!ic->internal_hash_alg.alg_string) { + r = -EINVAL; + ti->error = "Internal hash not set in inline mode"; + goto bad; + } + if (ic->journal_crypt_alg.alg_string || ic->journal_mac_alg.alg_string) { + r = -EINVAL; + ti->error = "Journal crypt not supported in inline mode"; + goto bad; + } + if (ic->discard) { + r = -EINVAL; + ti->error = "Discards not supported in inline mode"; + goto bad; + } + bi = blk_get_integrity(ic->dev->bdev->bd_disk); + if (!bi || bi->csum_type != BLK_INTEGRITY_CSUM_NONE) { + r = -EINVAL; + ti->error = "Integrity profile not supported"; + goto bad; + } + /*printk("tag_size: %u, 
tuple_size: %u\n", bi->tag_size, bi->tuple_size);*/ + if (bi->tuple_size < ic->tag_size) { + r = -EINVAL; + ti->error = "The integrity profile is smaller than tag size"; + goto bad; + } + if ((unsigned long)bi->tuple_size > PAGE_SIZE / 2) { + r = -EINVAL; + ti->error = "Too big tuple size"; + goto bad; + } + ic->tuple_size = bi->tuple_size; + if (1 << bi->interval_exp != ic->sectors_per_block << SECTOR_SHIFT) { + r = -EINVAL; + ti->error = "Integrity profile sector size mismatch"; + goto bad; + } + } + if (ic->mode == 'B' && !ic->internal_hash) { r = -EINVAL; ti->error = "Bitmap mode can be only used with internal hash"; @@ -4336,12 +4599,26 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv goto bad; } - r = mempool_init_page_pool(&ic->recheck_pool, 1, 0); + r = mempool_init_page_pool(&ic->recheck_pool, 1, ic->mode == 'I' ? 1 : 0); if (r) { ti->error = "Cannot allocate mempool"; goto bad; } + if (ic->mode == 'I') { + r = bioset_init(&ic->recheck_bios, RECHECK_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (r) { + ti->error = "Cannot allocate bio set"; + goto bad; + } + r = bioset_integrity_create(&ic->recheck_bios, RECHECK_POOL_SIZE); + if (r) { + ti->error = "Cannot allocate bio integrity set"; + r = -ENOMEM; + goto bad; + } + } + ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE); if (!ic->metadata_wq) { @@ -4418,11 +4695,16 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv should_write_sb = true; } - if (!ic->sb->version || ic->sb->version > SB_VERSION_5) { + if (!ic->sb->version || ic->sb->version > SB_VERSION_6) { r = -EINVAL; ti->error = "Unknown version"; goto bad; } + if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_INLINE)) != (ic->mode == 'I')) { + r = -EINVAL; + ti->error = "Inline flag mismatch"; + goto bad; + } if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) { r = -EINVAL; ti->error = "Tag size doesn't match the information in superblock"; @@ -4433,9 +4715,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv ti->error = "Block size doesn't match the information in superblock"; goto bad; } - if (!le32_to_cpu(ic->sb->journal_sections)) { + if (!le32_to_cpu(ic->sb->journal_sections) != (ic->mode == 'I')) { r = -EINVAL; - ti->error = "Corrupted superblock, journal_sections is 0"; + if (ic->mode != 'I') + ti->error = "Corrupted superblock, journal_sections is 0"; + else + ti->error = "Corrupted superblock, journal_sections is not 0"; goto bad; } /* make sure that ti->max_io_len doesn't overflow */ @@ -4487,8 +4772,9 @@ try_smaller_buffer: bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3); if (bits_in_journal > UINT_MAX) bits_in_journal = UINT_MAX; - while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit) - log2_sectors_per_bitmap_bit++; + if (bits_in_journal) + while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit) + log2_sectors_per_bitmap_bit++; log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block; ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit; @@ -4508,7 +4794,6 @@ try_smaller_buffer: goto bad; } - threshold = (__u64)ic->journal_entries * (100 - journal_watermark); threshold += 50; do_div(threshold, 100); @@ -4560,17 +4845,19 @@ try_smaller_buffer: goto bad; } - 
ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev, - 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL, 0); - if (IS_ERR(ic->bufio)) { - r = PTR_ERR(ic->bufio); - ti->error = "Cannot initialize dm-bufio"; - ic->bufio = NULL; - goto bad; + if (ic->mode != 'I') { + ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev, + 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL, 0); + if (IS_ERR(ic->bufio)) { + r = PTR_ERR(ic->bufio); + ti->error = "Cannot initialize dm-bufio"; + ic->bufio = NULL; + goto bad; + } + dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors); } - dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors); - if (ic->mode != 'R') { + if (ic->mode != 'R' && ic->mode != 'I') { r = create_journal(ic, &ti->error); if (r) goto bad; @@ -4630,7 +4917,7 @@ try_smaller_buffer: ic->just_formatted = true; } - if (!ic->meta_dev) { + if (!ic->meta_dev && ic->mode != 'I') { r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors); if (r) goto bad; @@ -4649,14 +4936,14 @@ try_smaller_buffer: } } - if (!ic->internal_hash) - dm_integrity_set(ti, ic); - ti->num_flush_bios = 1; ti->flush_supported = true; if (ic->discard) ti->num_discard_bios = 1; + if (ic->mode == 'I') + ti->mempool_needs_integrity = true; + dm_audit_log_ctr(DM_MSG_PREFIX, ti, 1); return 0; @@ -4690,6 +4977,7 @@ static void dm_integrity_dtr(struct dm_target *ti) kvfree(ic->bbs); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); + bioset_exit(&ic->recheck_bios); mempool_exit(&ic->recheck_pool); mempool_exit(&ic->journal_io_mempool); if (ic->io) @@ -4743,12 +5031,13 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 11, 0}, + .version = {1, 12, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, .dtr = dm_integrity_dtr, .map = dm_integrity_map, + .end_io = dm_integrity_end_io, .postsuspend = dm_integrity_postsuspend, .resume = dm_integrity_resume, .status = dm_integrity_status, diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 7409490259d1..d7a8e2f40db3 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -347,7 +347,7 @@ static void do_region(const blk_opf_t opf, unsigned int region, break; default: num_bvecs = bio_max_segs(dm_sector_div_up(remaining, - (PAGE_SIZE >> SECTOR_SHIFT))); + (PAGE_SIZE >> SECTOR_SHIFT)) + 1); } bio = bio_alloc_bioset(where->bdev, num_bvecs, opf, GFP_NOIO, @@ -384,16 +384,13 @@ static void do_region(const blk_opf_t opf, unsigned int region, static void dispatch_io(blk_opf_t opf, unsigned int num_regions, struct dm_io_region *where, struct dpages *dp, - struct io *io, int sync, unsigned short ioprio) + struct io *io, unsigned short ioprio) { int i; struct dpages old_pages = *dp; BUG_ON(num_regions > DM_IO_MAX_REGIONS); - if (sync) - opf |= REQ_SYNC; - /* * For multiple regions we need to be careful to rewind * the dp object for each call to do_region. 
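[Editor's sketch] The dm-io hunks here and below collapse the duplicated submission setup: dispatch_io() loses its sync flag (sync_io() now folds REQ_SYNC into the opf itself), async_io() becomes the one place a struct io is built, sync_io() is reduced to async_io() plus an on-stack completion fired by sync_io_complete(), and the multi-region read sanity check moves up into dm_io() so both paths share it. Below is a minimal user-space sketch of that sync-over-async shape; every name in it is illustrative, none of it is kernel API (build with -lpthread):

	#include <pthread.h>
	#include <stdio.h>

	typedef void (*notify_fn)(unsigned long error, void *context);

	struct async_job {
		notify_fn fn;
		void *context;
	};

	struct sync_ctx {
		unsigned long error_bits;
		pthread_mutex_t lock;
		pthread_cond_t done;
		int finished;
	};

	/* Counterpart of sync_io_complete(): stash the error bits, wake the waiter. */
	static void sync_complete(unsigned long error, void *context)
	{
		struct sync_ctx *ctx = context;

		pthread_mutex_lock(&ctx->lock);
		ctx->error_bits = error;
		ctx->finished = 1;
		pthread_cond_signal(&ctx->done);
		pthread_mutex_unlock(&ctx->lock);
	}

	static void *worker(void *arg)
	{
		struct async_job *job = arg;

		job->fn(0, job->context);	/* pretend the I/O succeeded */
		return NULL;
	}

	/* Stand-in for async_io(): fire-and-forget, completion via callback. */
	static void do_async(struct async_job *job, pthread_t *thread)
	{
		pthread_create(thread, NULL, worker, job);
	}

	/* Stand-in for sync_io(): the async path plus an on-stack wait. */
	static int do_sync(void)
	{
		pthread_t thread;
		struct sync_ctx ctx = {
			.lock = PTHREAD_MUTEX_INITIALIZER,
			.done = PTHREAD_COND_INITIALIZER,
		};
		struct async_job job = { .fn = sync_complete, .context = &ctx };

		do_async(&job, &thread);

		pthread_mutex_lock(&ctx.lock);
		while (!ctx.finished)
			pthread_cond_wait(&ctx.done, &ctx.lock);
		pthread_mutex_unlock(&ctx.lock);
		pthread_join(thread, NULL);

		return ctx.error_bits ? -1 : 0;
	}

	int main(void)
	{
		printf("do_sync() -> %d\n", do_sync());
		return 0;
	}

The kernel version waits with wait_for_completion_io() rather than a condition variable, but the division of labor is the same.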
@@ -411,6 +408,26 @@ static void dispatch_io(blk_opf_t opf, unsigned int num_regions, dec_count(io, 0, 0); } +static void async_io(struct dm_io_client *client, unsigned int num_regions, + struct dm_io_region *where, blk_opf_t opf, + struct dpages *dp, io_notify_fn fn, void *context, + unsigned short ioprio) +{ + struct io *io; + + io = mempool_alloc(&client->pool, GFP_NOIO); + io->error_bits = 0; + atomic_set(&io->count, 1); /* see dispatch_io() */ + io->client = client; + io->callback = fn; + io->context = context; + + io->vma_invalidate_address = dp->vma_invalidate_address; + io->vma_invalidate_size = dp->vma_invalidate_size; + + dispatch_io(opf, num_regions, where, dp, io, ioprio); +} + struct sync_io { unsigned long error_bits; struct completion wait; @@ -428,27 +445,12 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, struct dm_io_region *where, blk_opf_t opf, struct dpages *dp, unsigned long *error_bits, unsigned short ioprio) { - struct io *io; struct sync_io sio; - if (num_regions > 1 && !op_is_write(opf)) { - WARN_ON(1); - return -EIO; - } - init_completion(&sio.wait); - io = mempool_alloc(&client->pool, GFP_NOIO); - io->error_bits = 0; - atomic_set(&io->count, 1); /* see dispatch_io() */ - io->client = client; - io->callback = sync_io_complete; - io->context = &sio; - - io->vma_invalidate_address = dp->vma_invalidate_address; - io->vma_invalidate_size = dp->vma_invalidate_size; - - dispatch_io(opf, num_regions, where, dp, io, 1, ioprio); + async_io(client, num_regions, where, opf | REQ_SYNC, dp, + sync_io_complete, &sio, ioprio); wait_for_completion_io(&sio.wait); @@ -458,33 +460,6 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, return sio.error_bits ? -EIO : 0; } -static int async_io(struct dm_io_client *client, unsigned int num_regions, - struct dm_io_region *where, blk_opf_t opf, - struct dpages *dp, io_notify_fn fn, void *context, - unsigned short ioprio) -{ - struct io *io; - - if (num_regions > 1 && !op_is_write(opf)) { - WARN_ON(1); - fn(1, context); - return -EIO; - } - - io = mempool_alloc(&client->pool, GFP_NOIO); - io->error_bits = 0; - atomic_set(&io->count, 1); /* see dispatch_io() */ - io->client = client; - io->callback = fn; - io->context = context; - - io->vma_invalidate_address = dp->vma_invalidate_address; - io->vma_invalidate_size = dp->vma_invalidate_size; - - dispatch_io(opf, num_regions, where, dp, io, 0, ioprio); - return 0; -} - static int dp_init(struct dm_io_request *io_req, struct dpages *dp, unsigned long size) { @@ -529,6 +504,11 @@ int dm_io(struct dm_io_request *io_req, unsigned int num_regions, int r; struct dpages dp; + if (num_regions > 1 && !op_is_write(io_req->bi_opf)) { + WARN_ON(1); + return -EIO; + } + r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT); if (r) return r; @@ -537,9 +517,9 @@ int dm_io(struct dm_io_request *io_req, unsigned int num_regions, return sync_io(io_req->client, num_regions, where, io_req->bi_opf, &dp, sync_error_bits, ioprio); - return async_io(io_req->client, num_regions, where, - io_req->bi_opf, &dp, io_req->notify.fn, - io_req->notify.context, ioprio); + async_io(io_req->client, num_regions, where, io_req->bi_opf, &dp, + io_req->notify.fn, io_req->notify.context, ioprio); + return 0; } EXPORT_SYMBOL(dm_io); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 2d3e186ca87e..49fb0f684193 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -62,6 +62,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int 
argc, char **argv) ti->num_discard_bios = 1; ti->num_secure_erase_bios = 1; ti->num_write_zeroes_bios = 1; + ti->flush_bypasses_map = true; ti->private = lc; return 0; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 15b681b90153..637977acc3dc 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1419,8 +1419,7 @@ out: /* * Fail or reinstate all paths that match the provided struct dm_dev. */ -static int action_dev(struct multipath *m, struct dm_dev *dev, - action_fn action) +static int action_dev(struct multipath *m, dev_t dev, action_fn action) { int r = -EINVAL; struct pgpath *pgpath; @@ -1428,7 +1427,7 @@ static int action_dev(struct multipath *m, struct dm_dev *dev, list_for_each_entry(pg, &m->priority_groups, list) { list_for_each_entry(pgpath, &pg->pgpaths, list) { - if (pgpath->path.dev == dev) + if (pgpath->path.dev->bdev->bd_dev == dev) r = action(pgpath); } } @@ -1959,7 +1958,7 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg char *result, unsigned int maxlen) { int r = -EINVAL; - struct dm_dev *dev; + dev_t dev; struct multipath *m = ti->private; action_fn action; unsigned long flags; @@ -2008,7 +2007,7 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg goto out; } - r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); + r = dm_devt_from_path(argv[1], &dev); if (r) { DMWARN("message: error getting device %s", argv[1]); @@ -2017,8 +2016,6 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg r = action_dev(m, dev, action); - dm_put_device(ti, dev); - out: mutex_unlock(&m->work_mutex); return r; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index abe88d1e6735..0c3323e0adb2 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1626,6 +1626,23 @@ static int _check_data_dev_sectors(struct raid_set *rs) return 0; } +/* Get reshape sectors from data_offsets or raid set */ +static sector_t _get_reshape_sectors(struct raid_set *rs) +{ + struct md_rdev *rdev; + sector_t reshape_sectors = 0; + + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags)) { + reshape_sectors = (rdev->data_offset > rdev->new_data_offset) ? 
+ rdev->data_offset - rdev->new_data_offset : + rdev->new_data_offset - rdev->data_offset; + break; + } + + return max(reshape_sectors, (sector_t) rs->data_offset); +} + /* Calculate the sectors per device and per array used for @rs */ static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, bool use_mddev) { @@ -1656,7 +1673,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, b if (sector_div(dev_sectors, data_stripes)) goto bad; - array_sectors = (data_stripes + delta_disks) * dev_sectors; + array_sectors = (data_stripes + delta_disks) * (dev_sectors - _get_reshape_sectors(rs)); if (sector_div(array_sectors, rs->raid10_copies)) goto bad; @@ -1665,7 +1682,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, b else /* Striped layouts */ - array_sectors = (data_stripes + delta_disks) * dev_sectors; + array_sectors = (data_stripes + delta_disks) * (dev_sectors - _get_reshape_sectors(rs)); mddev->array_sectors = array_sectors; mddev->dev_sectors = dev_sectors; @@ -1704,11 +1721,20 @@ static void do_table_event(struct work_struct *ws) struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); smp_rmb(); /* Make sure we access most actual mddev properties */ - if (!rs_is_reshaping(rs)) { + + /* Only grow size resulting from added stripe(s) after reshape ended. */ + if (!rs_is_reshaping(rs) && + rs->array_sectors > rs->md.array_sectors && + !rs->md.delta_disks && + rs->md.raid_disks == rs->raid_disks) { + /* The raid10 personality doesn't provide proper device sizes -> correct. */ if (rs_is_raid10(rs)) rs_set_rdev_sectors(rs); + + rs->md.array_sectors = rs->array_sectors; rs_set_capacity(rs); } + dm_table_event(rs->ti->table); } @@ -2811,23 +2837,6 @@ static int rs_prepare_reshape(struct raid_set *rs) return 0; } -/* Get reshape sectors from data_offsets or raid set */ -static sector_t _get_reshape_sectors(struct raid_set *rs) -{ - struct md_rdev *rdev; - sector_t reshape_sectors = 0; - - rdev_for_each(rdev, &rs->md) - if (!test_bit(Journal, &rdev->flags)) { - reshape_sectors = (rdev->data_offset > rdev->new_data_offset) ? - rdev->data_offset - rdev->new_data_offset : - rdev->new_data_offset - rdev->data_offset; - break; - } - - return max(reshape_sectors, (sector_t) rs->data_offset); -} - /* * Reshape: * - change raid layout @@ -3542,7 +3551,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, recovery = rs->md.recovery; state = decipher_sync_action(mddev, recovery); progress = rs_get_progress(rs, recovery, state, resync_max_sectors); - resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? + resync_mismatches = mddev->last_sync_action == ACTION_CHECK ? atomic64_read(&mddev->resync_mismatches) : 0; /* HM FIXME: do we want another state char for raid0? 
It shows 'D'/'A'/'-' now */ @@ -3802,8 +3811,8 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) struct raid_set *rs = ti->private; unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors); - blk_limits_io_min(limits, chunk_size_bytes); - blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs)); + limits->io_min = chunk_size_bytes; + limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs); } static void raid_presuspend(struct dm_target *ti) @@ -4023,6 +4032,11 @@ static int raid_preresume(struct dm_target *ti) if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags)) return 0; + /* If different and no explicit grow request, expose MD array size as of superblock. */ + if (!test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags) && + rs->array_sectors != mddev->array_sectors) + rs_set_capacity(rs); + /* * The superblocks need to be updated on disk if the * array is new or new devices got added (thus zeroed @@ -4101,10 +4115,11 @@ static void raid_resume(struct dm_target *ti) if (mddev->delta_disks < 0) rs_set_capacity(rs); + mddev_lock_nointr(mddev); WARN_ON_ONCE(!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)); - WARN_ON_ONCE(test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + WARN_ON_ONCE(rcu_dereference_protected(mddev->sync_thread, + lockdep_is_held(&mddev->reconfig_mutex))); clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); - mddev_lock_nointr(mddev); mddev->ro = 0; mddev->in_sync = 0; md_unfrozen_sync_thread(mddev); diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 16b93ae51d96..4112071de0be 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -157,6 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_discard_bios = stripes; ti->num_secure_erase_bios = stripes; ti->num_write_zeroes_bios = stripes; + ti->flush_bypasses_map = true; sc->chunk_size = chunk_size; if (chunk_size & (chunk_size - 1)) @@ -458,8 +459,8 @@ static void stripe_io_hints(struct dm_target *ti, struct stripe_c *sc = ti->private; unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT; - blk_limits_io_min(limits, chunk_size); - blk_limits_io_opt(limits, chunk_size * sc->stripes); + limits->io_min = chunk_size; + limits->io_opt = chunk_size * sc->stripes; } static struct target_type stripe_target = { diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index b2d5246cff21..dbd39b9722b9 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -160,6 +160,7 @@ int dm_table_create(struct dm_table **result, blk_mode_t mode, t->type = DM_TYPE_NONE; t->mode = mode; t->md = md; + t->flush_bypasses_map = true; *result = t; return 0; } @@ -330,23 +331,15 @@ static int upgrade_mode(struct dm_dev_internal *dd, blk_mode_t new_mode, } /* - * Add a device to the list, or just increment the usage count if - * it's already present. - * * Note: the __ref annotation is because this function can call the __init * marked early_lookup_bdev when called during early boot code from dm-init.c. 
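 *
 * (Editor's sketch: with the lookup split out below as dm_devt_from_path(),
 * callers that only need a dev_t -- like the dm-mpath message handler
 * converted earlier in this diff -- can resolve a "major:minor" string or
 * a device path without pinning a dm_dev reference; for example:
 *
 *	dev_t dev;
 *
 *	if (!dm_devt_from_path("8:16", &dev))
 *		...match paths via pgpath->path.dev->bdev->bd_dev == dev...
 * )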
*/ -int __ref dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, - struct dm_dev **result) +int __ref dm_devt_from_path(const char *path, dev_t *dev_p) { int r; dev_t dev; unsigned int major, minor; char dummy; - struct dm_dev_internal *dd; - struct dm_table *t = ti->table; - - BUG_ON(!t); if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { /* Extract the major/minor numbers */ @@ -362,6 +355,29 @@ int __ref dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, if (r) return r; } + *dev_p = dev; + return 0; +} +EXPORT_SYMBOL(dm_devt_from_path); + +/* + * Add a device to the list, or just increment the usage count if + * it's already present. + */ +int dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, + struct dm_dev **result) +{ + int r; + dev_t dev; + struct dm_dev_internal *dd; + struct dm_table *t = ti->table; + + BUG_ON(!t); + + r = dm_devt_from_path(path, &dev); + if (r) + return r; + if (dev == disk_devt(t->md->disk)) return -EINVAL; @@ -425,6 +441,13 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, q->limits.logical_block_size, q->limits.alignment_offset, (unsigned long long) start << SECTOR_SHIFT); + + /* + * Only stack the integrity profile if the target doesn't have native + * integrity support. + */ + if (!dm_target_has_integrity(ti->type)) + queue_limits_stack_integrity_bdev(limits, bdev); return 0; } @@ -572,6 +595,12 @@ int dm_split_args(int *argc, char ***argvp, char *input) return 0; } +static void dm_set_stacking_limits(struct queue_limits *limits) +{ + blk_set_stacking_limits(limits); + limits->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL; +} + /* * Impose necessary and sufficient conditions on a devices's table such * that any incoming bio which respects its logical_block_size can be @@ -610,7 +639,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *t, for (i = 0; i < t->num_targets; i++) { ti = dm_table_get_target(t, i); - blk_set_stacking_limits(&ti_limits); + dm_set_stacking_limits(&ti_limits); /* combine all target devices' limits */ if (ti->type->iterate_devices) @@ -702,9 +731,6 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->immutable_target_type = ti->type; } - if (dm_target_has_integrity(ti->type)) - t->integrity_added = 1; - ti->table = t; ti->begin = start; ti->len = len; @@ -738,6 +764,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key)) static_branch_enable(&swap_bios_enabled); + if (!ti->flush_bypasses_map) + t->flush_bypasses_map = false; + return 0; bad: @@ -1014,14 +1043,14 @@ bool dm_table_request_based(struct dm_table *t) return __table_type_request_based(dm_table_get_type(t)); } -static bool dm_table_supports_poll(struct dm_table *t); - static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { enum dm_queue_mode type = dm_table_get_type(t); unsigned int per_io_data_size = 0, front_pad, io_front_pad; unsigned int min_pool_size = 0, pool_size; struct dm_md_mempools *pools; + unsigned int bioset_flags = 0; + bool mempool_needs_integrity = t->integrity_supported; if (unlikely(type == DM_TYPE_NONE)) { DMERR("no table type is set, can't allocate mempools"); @@ -1038,11 +1067,16 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * goto init_bs; } + if (md->queue->limits.features & BLK_FEAT_POLL) + bioset_flags |= BIOSET_PERCPU_CACHE; + for (unsigned int i = 0; i < 
t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); per_io_data_size = max(per_io_data_size, ti->per_io_data_size); min_pool_size = max(min_pool_size, ti->num_flush_bios); + + mempool_needs_integrity |= ti->mempool_needs_integrity; } pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, @@ -1050,16 +1084,15 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; - if (bioset_init(&pools->io_bs, pool_size, io_front_pad, - dm_table_supports_poll(t) ? BIOSET_PERCPU_CACHE : 0)) + if (bioset_init(&pools->io_bs, pool_size, io_front_pad, bioset_flags)) goto out_free_pools; - if (t->integrity_supported && + if (mempool_needs_integrity && bioset_integrity_create(&pools->io_bs, pool_size)) goto out_free_pools; init_bs: if (bioset_init(&pools->bs, pool_size, front_pad, 0)) goto out_free_pools; - if (t->integrity_supported && + if (mempool_needs_integrity && bioset_integrity_create(&pools->bs, pool_size)) goto out_free_pools; @@ -1119,99 +1152,6 @@ static int dm_table_build_index(struct dm_table *t) return r; } -static bool integrity_profile_exists(struct gendisk *disk) -{ - return !!blk_get_integrity(disk); -} - -/* - * Get a disk whose integrity profile reflects the table's profile. - * Returns NULL if integrity support was inconsistent or unavailable. - */ -static struct gendisk *dm_table_get_integrity_disk(struct dm_table *t) -{ - struct list_head *devices = dm_table_get_devices(t); - struct dm_dev_internal *dd = NULL; - struct gendisk *prev_disk = NULL, *template_disk = NULL; - - for (unsigned int i = 0; i < t->num_targets; i++) { - struct dm_target *ti = dm_table_get_target(t, i); - - if (!dm_target_passes_integrity(ti->type)) - goto no_integrity; - } - - list_for_each_entry(dd, devices, list) { - template_disk = dd->dm_dev->bdev->bd_disk; - if (!integrity_profile_exists(template_disk)) - goto no_integrity; - else if (prev_disk && - blk_integrity_compare(prev_disk, template_disk) < 0) - goto no_integrity; - prev_disk = template_disk; - } - - return template_disk; - -no_integrity: - if (prev_disk) - DMWARN("%s: integrity not set: %s and %s profile mismatch", - dm_device_name(t->md), - prev_disk->disk_name, - template_disk->disk_name); - return NULL; -} - -/* - * Register the mapped device for blk_integrity support if the - * underlying devices have an integrity profile. But all devices may - * not have matching profiles (checking all devices isn't reliable - * during table load because this table may use other DM device(s) which - * must be resumed before they will have an initialized integity - * profile). Consequently, stacked DM devices force a 2 stage integrity - * profile validation: First pass during table load, final pass during - * resume. - */ -static int dm_table_register_integrity(struct dm_table *t) -{ - struct mapped_device *md = t->md; - struct gendisk *template_disk = NULL; - - /* If target handles integrity itself do not register it here. */ - if (t->integrity_added) - return 0; - - template_disk = dm_table_get_integrity_disk(t); - if (!template_disk) - return 0; - - if (!integrity_profile_exists(dm_disk(md))) { - t->integrity_supported = true; - /* - * Register integrity profile during table load; we can do - * this because the final profile must match during resume. 
- */ - blk_integrity_register(dm_disk(md), - blk_get_integrity(template_disk)); - return 0; - } - - /* - * If DM device already has an initialized integrity - * profile the new profile should not conflict. - */ - if (blk_integrity_compare(dm_disk(md), template_disk) < 0) { - DMERR("%s: conflict with existing integrity profile: %s profile mismatch", - dm_device_name(t->md), - template_disk->disk_name); - return 1; - } - - /* Preserve existing integrity profile */ - t->integrity_supported = true; - return 0; -} - #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct dm_crypto_profile { @@ -1423,12 +1363,6 @@ int dm_table_complete(struct dm_table *t) return r; } - r = dm_table_register_integrity(t); - if (r) { - DMERR("could not register integrity profile."); - return r; - } - r = dm_table_construct_crypto_profile(t); if (r) { DMERR("could not construct crypto profile."); @@ -1493,14 +1427,6 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) return &t->targets[(KEYS_PER_NODE * n) + k]; } -static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - - return !test_bit(QUEUE_FLAG_POLL, &q->queue_flags); -} - /* * type->iterate_devices() should be called when the sanity check needs to * iterate and check all underlying data devices. iterate_devices() will @@ -1548,19 +1474,6 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev, return 0; } -static bool dm_table_supports_poll(struct dm_table *t) -{ - for (unsigned int i = 0; i < t->num_targets; i++) { - struct dm_target *ti = dm_table_get_target(t, i); - - if (!ti->type->iterate_devices || - ti->type->iterate_devices(ti, device_not_poll_capable, NULL)) - return false; - } - - return true; -} - /* * Check whether a table has no data devices attached using each * target's iterate_devices method. @@ -1686,12 +1599,20 @@ int dm_calculate_queue_limits(struct dm_table *t, unsigned int zone_sectors = 0; bool zoned = false; - blk_set_stacking_limits(limits); + dm_set_stacking_limits(limits); + t->integrity_supported = true; for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); - blk_set_stacking_limits(&ti_limits); + if (!dm_target_passes_integrity(ti->type)) + t->integrity_supported = false; + } + + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + dm_set_stacking_limits(&ti_limits); if (!ti->type->iterate_devices) { /* Set I/O hints portion of queue limits */ @@ -1706,12 +1627,12 @@ int dm_calculate_queue_limits(struct dm_table *t, ti->type->iterate_devices(ti, dm_set_device_limits, &ti_limits); - if (!zoned && ti_limits.zoned) { + if (!zoned && (ti_limits.features & BLK_FEAT_ZONED)) { /* * After stacking all limits, validate all devices * in table support this zoned model and zone sectors. 
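 *
 * (Editor's note: the hunks around here migrate from request_queue flags
 * to bits in queue_limits->features, so capability checks and overrides
 * become plain bit operations on the stacked limits, e.g.:
 *
 *	if (ti_limits.features & BLK_FEAT_ZONED)
 *		zone_sectors = ti_limits.chunk_sectors;
 *	limits->features &= ~BLK_FEAT_POLL;
 * )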
*/ - zoned = ti_limits.zoned; + zoned = (ti_limits.features & BLK_FEAT_ZONED); zone_sectors = ti_limits.chunk_sectors; } @@ -1738,6 +1659,18 @@ combine_limits: dm_device_name(t->md), (unsigned long long) ti->begin, (unsigned long long) ti->len); + + if (t->integrity_supported || + dm_target_has_integrity(ti->type)) { + if (!queue_limits_stack_integrity(limits, &ti_limits)) { + DMWARN("%s: adding target device (start sect %llu len %llu) " + "disabled integrity support due to incompatibility", + dm_device_name(t->md), + (unsigned long long) ti->begin, + (unsigned long long) ti->len); + t->integrity_supported = false; + } + } } /* @@ -1747,12 +1680,12 @@ combine_limits: * zoned model on host-managed zoned block devices. * BUT... */ - if (limits->zoned) { + if (limits->features & BLK_FEAT_ZONED) { /* * ...IF the above limits stacking determined a zoned model * validate that all of the table's devices conform to it. */ - zoned = limits->zoned; + zoned = limits->features & BLK_FEAT_ZONED; zone_sectors = limits->chunk_sectors; } if (validate_hardware_zoned(t, zoned, zone_sectors)) @@ -1762,63 +1695,15 @@ combine_limits: } /* - * Verify that all devices have an integrity profile that matches the - * DM device's registered integrity profile. If the profiles don't - * match then unregister the DM device's integrity profile. + * Check if a target requires flush support even if none of the underlying + * devices need it (e.g. to persist target-specific metadata). */ -static void dm_table_verify_integrity(struct dm_table *t) +static bool dm_table_supports_flush(struct dm_table *t) { - struct gendisk *template_disk = NULL; - - if (t->integrity_added) - return; - - if (t->integrity_supported) { - /* - * Verify that the original integrity profile - * matches all the devices in this table. - */ - template_disk = dm_table_get_integrity_disk(t); - if (template_disk && - blk_integrity_compare(dm_disk(t->md), template_disk) >= 0) - return; - } - - if (integrity_profile_exists(dm_disk(t->md))) { - DMWARN("%s: unable to establish an integrity profile", - dm_device_name(t->md)); - blk_integrity_unregister(dm_disk(t->md)); - } -} - -static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - unsigned long flush = (unsigned long) data; - struct request_queue *q = bdev_get_queue(dev->bdev); - - return (q->queue_flags & flush); -} - -static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) -{ - /* - * Require at least one underlying device to support flushes. - * t->devices includes internal dm devices such as mirror logs - * so we need to use iterate_devices here, which targets - * supporting flushes must provide. 
- */
 	for (unsigned int i = 0; i < t->num_targets; i++) {
 		struct dm_target *ti = dm_table_get_target(t, i);
 
-		if (!ti->num_flush_bios)
-			continue;
-
-		if (ti->flush_supported)
-			return true;
-
-		if (ti->type->iterate_devices &&
-		    ti->type->iterate_devices(ti, device_flush_capable, (void *) flush))
+		if (ti->num_flush_bios && ti->flush_supported)
 			return true;
 	}
 
@@ -1839,20 +1724,6 @@ static int device_dax_write_cache_enabled(struct dm_target *ti,
 	return false;
 }
 
-static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev,
-				sector_t start, sector_t len, void *data)
-{
-	return !bdev_nonrot(dev->bdev);
-}
-
-static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
-				sector_t start, sector_t len, void *data)
-{
-	struct request_queue *q = bdev_get_queue(dev->bdev);
-
-	return !blk_queue_add_random(q);
-}
-
 static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev,
 					   sector_t start, sector_t len, void *data)
 {
@@ -1877,12 +1748,6 @@ static bool dm_table_supports_write_zeroes(struct dm_table *t)
 	return true;
 }
 
-static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev,
-				     sector_t start, sector_t len, void *data)
-{
-	return !bdev_nowait(dev->bdev);
-}
-
 static bool dm_table_supports_nowait(struct dm_table *t)
 {
 	for (unsigned int i = 0; i < t->num_targets; i++) {
@@ -1890,10 +1755,6 @@ static bool dm_table_supports_nowait(struct dm_table *t)
 
 		if (!dm_target_supports_nowait(ti->type))
 			return false;
-
-		if (!ti->type->iterate_devices ||
-		    ti->type->iterate_devices(ti, device_not_nowait_capable, NULL))
-			return false;
 	}
 
 	return true;
 }
@@ -1950,29 +1811,25 @@ static bool dm_table_supports_secure_erase(struct dm_table *t)
 	return true;
 }
 
-static int device_requires_stable_pages(struct dm_target *ti,
-					struct dm_dev *dev, sector_t start,
-					sector_t len, void *data)
-{
-	return bdev_stable_writes(dev->bdev);
-}
-
 int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 			      struct queue_limits *limits)
 {
-	bool wc = false, fua = false;
 	int r;
 
-	if (dm_table_supports_nowait(t))
-		blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
-	else
-		blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q);
+	if (!dm_table_supports_nowait(t))
+		limits->features &= ~BLK_FEAT_NOWAIT;
+
+	/*
+	 * The current polling implementation does not support request based
+	 * stacking.
+ */ + if (!__table_type_bio_based(t->type)) + limits->features &= ~BLK_FEAT_POLL; if (!dm_table_supports_discards(t)) { limits->max_hw_discard_sectors = 0; limits->discard_granularity = 0; limits->discard_alignment = 0; - limits->discard_misaligned = 0; } if (!dm_table_supports_write_zeroes(t)) @@ -1981,58 +1838,22 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_secure_erase(t)) limits->max_secure_erase_sectors = 0; - if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { - wc = true; - if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) - fua = true; - } - blk_queue_write_cache(q, wc, fua); + if (dm_table_supports_flush(t)) + limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; if (dm_table_supports_dax(t, device_not_dax_capable)) { - blk_queue_flag_set(QUEUE_FLAG_DAX, q); + limits->features |= BLK_FEAT_DAX; if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) set_dax_synchronous(t->md->dax_dev); } else - blk_queue_flag_clear(QUEUE_FLAG_DAX, q); + limits->features &= ~BLK_FEAT_DAX; if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) dax_write_cache(t->md->dax_dev, true); - /* Ensure that all underlying devices are non-rotational. */ - if (dm_table_any_dev_attr(t, device_is_rotational, NULL)) - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - - dm_table_verify_integrity(t); - - /* - * Some devices don't use blk_integrity but still want stable pages - * because they do their own checksumming. - * If any underlying device requires stable pages, a table must require - * them as well. Only targets that support iterate_devices are considered: - * don't want error, zero, etc to require stable pages. - */ - if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL)) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); - else - blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); - - /* - * Determine whether or not this queue's I/O timings contribute - * to the entropy pool, Only request-based targets use this. - * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not - * have it set. - */ - if (blk_queue_add_random(q) && - dm_table_any_dev_attr(t, device_is_not_random, NULL)) - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); - - /* - * For a zoned target, setup the zones related queue attributes - * and resources necessary for zone append emulation if necessary. - */ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && limits->zoned) { + /* For a zoned table, setup the zone related queue attributes. */ + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + (limits->features & BLK_FEAT_ZONED)) { r = dm_set_zones_restrictions(t, q, limits); if (r) return r; @@ -2042,22 +1863,18 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (r) return r; - dm_update_crypto_profile(q, t); - /* - * Check for request-based device is left to - * dm_mq_init_request_queue()->blk_mq_init_allocated_queue(). - * - * For bio-based device, only set QUEUE_FLAG_POLL when all - * underlying devices supporting polling. + * Now that the limits are set, check the zones mapped by the table + * and setup the resources for zone append emulation if necessary. 
*/ - if (__table_type_bio_based(t->type)) { - if (dm_table_supports_poll(t)) - blk_queue_flag_set(QUEUE_FLAG_POLL, q); - else - blk_queue_flag_clear(QUEUE_FLAG_POLL, q); + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + (limits->features & BLK_FEAT_ZONED)) { + r = dm_revalidate_zones(t, q); + if (r) + return r; } + dm_update_crypto_profile(q, t); return 0; } diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 6022189c1388..f90679cfec5b 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -249,7 +249,7 @@ struct dm_thin_device { */ #define SUPERBLOCK_CSUM_XOR 160774 -static void sb_prepare_for_write(struct dm_block_validator *v, +static void sb_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -261,7 +261,7 @@ static void sb_prepare_for_write(struct dm_block_validator *v, SUPERBLOCK_CSUM_XOR)); } -static int sb_check(struct dm_block_validator *v, +static int sb_check(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -294,7 +294,7 @@ static int sb_check(struct dm_block_validator *v, return 0; } -static struct dm_block_validator sb_validator = { +static const struct dm_block_validator sb_validator = { .name = "superblock", .prepare_for_write = sb_prepare_for_write, .check = sb_check diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 991f77a5885b..a0c1620e90c8 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -4079,10 +4079,10 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) if (io_opt_sectors < pool->sectors_per_block || !is_factor(io_opt_sectors, pool->sectors_per_block)) { if (is_factor(pool->sectors_per_block, limits->max_sectors)) - blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT); + limits->io_min = limits->max_sectors << SECTOR_SHIFT; else - blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); - blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); + limits->io_min = pool->sectors_per_block << SECTOR_SHIFT; + limits->io_opt = pool->sectors_per_block << SECTOR_SHIFT; } /* diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 117266e1b3ae..39ac68614419 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -148,11 +148,6 @@ #include "vdo.h" #include "wait-queue.h" -struct uds_attribute { - struct attribute attr; - const char *(*show_string)(struct hash_zones *hash_zones); -}; - #define DEDUPE_QUERY_TIMER_IDLE 0 #define DEDUPE_QUERY_TIMER_RUNNING 1 #define DEDUPE_QUERY_TIMER_FIRED 2 diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index b423bec6458b..dd05691e4097 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -928,9 +928,9 @@ static void vdo_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->physical_block_size = VDO_BLOCK_SIZE; /* The minimum io size for random io */ - blk_limits_io_min(limits, VDO_BLOCK_SIZE); + limits->io_min = VDO_BLOCK_SIZE; /* The optimal io size for streamed/sequential io */ - blk_limits_io_opt(limits, VDO_BLOCK_SIZE); + limits->io_opt = VDO_BLOCK_SIZE; /* * Sets the maximum discard size that will be passed into VDO. This value comes from a @@ -945,7 +945,7 @@ static void vdo_io_hints(struct dm_target *ti, struct queue_limits *limits) * The value is used by dm-thin to determine whether to pass down discards. The block layer * splits large discards on this boundary when this is set. 
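 *
 * (Editor's note on the hunk below: newer block-core validation derives
 * the effective discard limit from the hardware cap, roughly
 *
 *	lim->max_discard_sectors =
 *		min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors);
 *
 * so a stacked driver such as vdo should publish max_hw_discard_sectors
 * and let the block layer compute max_discard_sectors; this summary is
 * the editor's, not wording from the patch.)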
*/ - limits->max_discard_sectors = + limits->max_hw_discard_sectors = (vdo->device_config->max_discard_blocks * VDO_SECTORS_PER_BLOCK); /* diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c index 1ba767144426..df4934846244 100644 --- a/drivers/md/dm-vdo/indexer/index.c +++ b/drivers/md/dm-vdo/indexer/index.c @@ -197,15 +197,12 @@ static int finish_previous_chapter(struct uds_index *index, u64 current_chapter_ static int swap_open_chapter(struct index_zone *zone) { int result; - struct open_chapter_zone *temporary_chapter; result = finish_previous_chapter(zone->index, zone->newest_virtual_chapter); if (result != UDS_SUCCESS) return result; - temporary_chapter = zone->open_chapter; - zone->open_chapter = zone->writing_chapter; - zone->writing_chapter = temporary_chapter; + swap(zone->open_chapter, zone->writing_chapter); return UDS_SUCCESS; } diff --git a/drivers/md/dm-vdo/int-map.c b/drivers/md/dm-vdo/int-map.c index 3aa438f84ea1..f6fe58e437b3 100644 --- a/drivers/md/dm-vdo/int-map.c +++ b/drivers/md/dm-vdo/int-map.c @@ -96,7 +96,7 @@ struct int_map { size_t size; /** @capacity: The number of neighborhoods in the map. */ size_t capacity; - /* @bucket_count: The number of buckets in the bucket array. */ + /** @bucket_count: The number of buckets in the bucket array. */ size_t bucket_count; /** @buckets: The array of hash buckets. */ struct bucket *buckets; diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c index defc9359f10e..7e0009d2f67d 100644 --- a/drivers/md/dm-vdo/repair.c +++ b/drivers/md/dm-vdo/repair.c @@ -51,6 +51,8 @@ struct recovery_point { bool increment_applied; }; +DEFINE_MIN_HEAP(struct numbered_block_mapping, replay_heap); + struct repair_completion { /* The completion header */ struct vdo_completion completion; @@ -97,7 +99,7 @@ struct repair_completion { * order, then original journal order. This permits efficient iteration over the journal * entries in order. */ - struct min_heap replay_heap; + struct replay_heap replay_heap; /* Fields tracking progress through the journal entries. */ struct numbered_block_mapping *current_entry; struct numbered_block_mapping *current_unfetched_entry; @@ -135,7 +137,7 @@ struct repair_completion { * to sort by slot while still ensuring we replay all entries with the same slot in the exact order * as they appeared in the journal. 
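 *
 * (Editor's sketch of the reworked min_heap API used below: the element
 * size now comes from a typed DEFINE_MIN_HEAP() declaration instead of an
 * .elem_size callback field, and the comparison/swap helpers take an
 * extra opaque args pointer. A self-contained toy instance, all names
 * illustrative:
 *
 *	DEFINE_MIN_HEAP(int, int_heap);
 *
 *	static bool int_less(const void *a, const void *b, void *args)
 *	{
 *		return *(const int *)a < *(const int *)b;
 *	}
 *
 *	static void int_swap(void *a, void *b, void *args)
 *	{
 *		int tmp = *(int *)a;
 *
 *		*(int *)a = *(int *)b;
 *		*(int *)b = tmp;
 *	}
 *
 *	static const struct min_heap_callbacks int_cbs = {
 *		.less = int_less,
 *		.swp = int_swap,
 *	};
 *
 *	struct int_heap h = { .data = buf, .nr = n, .size = n };
 *
 *	min_heapify_all(&h, &int_cbs, NULL);
 * )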
*/ -static bool mapping_is_less_than(const void *item1, const void *item2) +static bool mapping_is_less_than(const void *item1, const void *item2, void __always_unused *args) { const struct numbered_block_mapping *mapping1 = (const struct numbered_block_mapping *) item1; @@ -154,7 +156,7 @@ static bool mapping_is_less_than(const void *item1, const void *item2) return 0; } -static void swap_mappings(void *item1, void *item2) +static void swap_mappings(void *item1, void *item2, void __always_unused *args) { struct numbered_block_mapping *mapping1 = item1; struct numbered_block_mapping *mapping2 = item2; @@ -163,14 +165,13 @@ static void swap_mappings(void *item1, void *item2) } static const struct min_heap_callbacks repair_min_heap = { - .elem_size = sizeof(struct numbered_block_mapping), .less = mapping_is_less_than, .swp = swap_mappings, }; static struct numbered_block_mapping *sort_next_heap_element(struct repair_completion *repair) { - struct min_heap *heap = &repair->replay_heap; + struct replay_heap *heap = &repair->replay_heap; struct numbered_block_mapping *last; if (heap->nr == 0) @@ -181,8 +182,8 @@ static struct numbered_block_mapping *sort_next_heap_element(struct repair_compl * restore the heap invariant, and return a pointer to the popped element. */ last = &repair->entries[--heap->nr]; - swap_mappings(heap->data, last); - min_heapify(heap, 0, &repair_min_heap); + swap_mappings(heap->data, last, NULL); + min_heap_sift_down(heap, 0, &repair_min_heap, NULL); return last; } @@ -318,6 +319,7 @@ static bool __must_check abort_on_error(int result, struct repair_completion *re /** * drain_slab_depot() - Flush out all dirty refcounts blocks now that they have been rebuilt or * recovered. + * @completion: The repair completion. */ static void drain_slab_depot(struct vdo_completion *completion) { @@ -653,9 +655,6 @@ static void rebuild_reference_counts(struct vdo_completion *completion) vdo_traverse_forest(vdo->block_map, process_entry, completion); } -/** - * increment_recovery_point() - Move the given recovery point forward by one entry. - */ static void increment_recovery_point(struct recovery_point *point) { if (++point->entry_count < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) @@ -952,6 +951,7 @@ static void abort_block_map_recovery(struct repair_completion *repair, int resul /** * find_entry_starting_next_page() - Find the first journal entry after a given entry which is not * on the same block map page. + * @repair: The repair completion. * @current_entry: The entry to search from. * @needs_sort: Whether sorting is needed to proceed. * @@ -1117,12 +1117,12 @@ static void recover_block_map(struct vdo_completion *completion) * Organize the journal entries into a binary heap so we can iterate over them in sorted * order incrementally, avoiding an expensive sort call. */ - repair->replay_heap = (struct min_heap) { + repair->replay_heap = (struct replay_heap) { .data = repair->entries, .nr = repair->block_map_entry_count, .size = repair->block_map_entry_count, }; - min_heapify_all(&repair->replay_heap, &repair_min_heap); + min_heapify_all(&repair->replay_heap, &repair_min_heap, NULL); vdo_log_info("Replaying %zu recovery entries into block map", repair->block_map_entry_count); @@ -1218,6 +1218,7 @@ static bool __must_check is_exact_recovery_journal_block(const struct recovery_j /** * find_recovery_journal_head_and_tail() - Find the tail and head of the journal. + * @repair: The repair completion. * * Return: True if there were valid journal blocks. 
*/ @@ -1446,6 +1447,7 @@ static int validate_heads(struct repair_completion *repair) /** * extract_new_mappings() - Find all valid new mappings to be applied to the block map. + * @repair: The repair completion. * * The mappings are extracted from the journal and stored in a sortable array so that all of the * mappings to be applied to a given block map page can be done in a single page fetch. @@ -1500,6 +1502,7 @@ static int extract_new_mappings(struct repair_completion *repair) /** * compute_usages() - Compute the lbns in use and block map data blocks counts from the tail of * the journal. + * @repair: The repair completion. */ static noinline int compute_usages(struct repair_completion *repair) { diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 46e4721e5b4f..274f9ccd072f 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -3288,7 +3288,8 @@ int vdo_release_block_reference(struct block_allocator *allocator, * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements * before larger ones. */ -static bool slab_status_is_less_than(const void *item1, const void *item2) +static bool slab_status_is_less_than(const void *item1, const void *item2, + void __always_unused *args) { const struct slab_status *info1 = item1; const struct slab_status *info2 = item2; @@ -3300,7 +3301,7 @@ static bool slab_status_is_less_than(const void *item1, const void *item2) return info1->slab_number < info2->slab_number; } -static void swap_slab_statuses(void *item1, void *item2) +static void swap_slab_statuses(void *item1, void *item2, void __always_unused *args) { struct slab_status *info1 = item1; struct slab_status *info2 = item2; @@ -3309,7 +3310,6 @@ static void swap_slab_statuses(void *item1, void *item2) } static const struct min_heap_callbacks slab_status_min_heap = { - .elem_size = sizeof(struct slab_status), .less = slab_status_is_less_than, .swp = swap_slab_statuses, }; @@ -3509,7 +3509,7 @@ static int get_slab_statuses(struct block_allocator *allocator, static int __must_check vdo_prepare_slabs_for_allocation(struct block_allocator *allocator) { struct slab_status current_slab_status; - struct min_heap heap; + DEFINE_MIN_HEAP(struct slab_status, heap) heap; int result; struct slab_status *slab_statuses; struct slab_depot *depot = allocator->depot; @@ -3521,12 +3521,12 @@ static int __must_check vdo_prepare_slabs_for_allocation(struct block_allocator return result; /* Sort the slabs by cleanliness, then by emptiness hint. 
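 *
 * (Editor's note: DEFINE_MIN_HEAP(struct slab_status, heap) in the hunk
 * above declares a "struct heap" type inline -- under the editor's
 * reading of linux/min_heap.h it expands to nr/size/data members plus a
 * preallocated[] tail -- which is why the compound literal just below is
 * cast to (struct heap). An equivalent, less terse spelling would be:
 *
 *	DEFINE_MIN_HEAP(struct slab_status, slab_status_heap);
 *	struct slab_status_heap heap;
 * )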
*/ - heap = (struct min_heap) { + heap = (struct heap) { .data = slab_statuses, .nr = allocator->slab_count, .size = allocator->slab_count, }; - min_heapify_all(&heap, &slab_status_min_heap); + min_heapify_all(&heap, &slab_status_min_heap, NULL); while (heap.nr > 0) { bool high_priority; @@ -3534,7 +3534,7 @@ static int __must_check vdo_prepare_slabs_for_allocation(struct block_allocator struct slab_journal *journal; current_slab_status = slab_statuses[0]; - min_heap_pop(&heap, &slab_status_min_heap); + min_heap_pop(&heap, &slab_status_min_heap, NULL); slab = depot->slabs[current_slab_status.slab_number]; if ((depot->load_type == VDO_SLAB_DEPOT_REBUILD_LOAD) || diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index e46aee6f932e..62b1a44b8dd2 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -186,8 +186,7 @@ error: static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, u8 *want_digest, u8 *data) { - if (unlikely(verity_hash(v, verity_io_hash_req(v, io), - data, 1 << v->data_dev_block_bits, + if (unlikely(verity_hash(v, io, data, 1 << v->data_dev_block_bits, verity_io_real_digest(v, io), true))) return 0; @@ -388,8 +387,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, } /* Always re-validate the corrected block against the expected hash */ - r = verity_hash(v, verity_io_hash_req(v, io), fio->output, - 1 << v->data_dev_block_bits, + r = verity_hash(v, io, fio->output, 1 << v->data_dev_block_bits, verity_io_real_digest(v, io), true); if (unlikely(r < 0)) return r; @@ -404,24 +402,9 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, return 0; } -static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data, - size_t len) -{ - struct dm_verity_fec_io *fio = fec_io(io); - - memcpy(data, &fio->output[fio->output_pos], len); - fio->output_pos += len; - - return 0; -} - -/* - * Correct errors in a block. Copies corrected block to dest if non-NULL, - * otherwise to a bio_vec starting from iter. - */ +/* Correct errors in a block. Copies corrected block to dest. 
*/ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, - enum verity_block_type type, sector_t block, u8 *dest, - struct bvec_iter *iter) + enum verity_block_type type, sector_t block, u8 *dest) { int r; struct dm_verity_fec_io *fio = fec_io(io); @@ -471,12 +454,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, goto done; } - if (dest) - memcpy(dest, fio->output, 1 << v->data_dev_block_bits); - else if (iter) { - fio->output_pos = 0; - r = verity_for_bv_block(v, io, iter, fec_bv_copy); - } + memcpy(dest, fio->output, 1 << v->data_dev_block_bits); done: fio->level--; diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 8454070d2824..09123a612953 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -57,7 +57,6 @@ struct dm_verity_fec_io { u8 *bufs[DM_VERITY_FEC_BUF_MAX]; /* bufs for deinterleaving */ unsigned int nbufs; /* number of buffers allocated */ u8 *output; /* buffer for corrected output */ - size_t output_pos; unsigned int level; /* recursion level */ }; @@ -70,7 +69,7 @@ extern bool verity_fec_is_enabled(struct dm_verity *v); extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, enum verity_block_type type, sector_t block, - u8 *dest, struct bvec_iter *iter); + u8 *dest); extern unsigned int verity_fec_status_table(struct dm_verity *v, unsigned int sz, char *result, unsigned int maxlen); @@ -100,8 +99,7 @@ static inline bool verity_fec_is_enabled(struct dm_verity *v) static inline int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, enum verity_block_type type, - sector_t block, u8 *dest, - struct bvec_iter *iter) + sector_t block, u8 *dest) { return -EOPNOTSUPP; } diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index bb5da66da4c1..cf659c8feb29 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -48,6 +48,9 @@ module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644); static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled); +/* Is at least one dm-verity instance using ahash_tfm instead of shash_tfm? */ +static DEFINE_STATIC_KEY_FALSE(ahash_enabled); + struct dm_verity_prefetch_work { struct work_struct work; struct dm_verity *v; @@ -102,7 +105,7 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, return block >> (level * v->hash_per_block_bits); } -static int verity_hash_update(struct dm_verity *v, struct ahash_request *req, +static int verity_ahash_update(struct dm_verity *v, struct ahash_request *req, const u8 *data, size_t len, struct crypto_wait *wait) { @@ -135,12 +138,12 @@ static int verity_hash_update(struct dm_verity *v, struct ahash_request *req, /* * Wrapper for crypto_ahash_init, which handles verity salting. */ -static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, +static int verity_ahash_init(struct dm_verity *v, struct ahash_request *req, struct crypto_wait *wait, bool may_sleep) { int r; - ahash_request_set_tfm(req, v->tfm); + ahash_request_set_tfm(req, v->ahash_tfm); ahash_request_set_callback(req, may_sleep ? 
CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG : 0, crypto_req_done, (void *)wait); @@ -155,18 +158,18 @@ static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, } if (likely(v->salt_size && (v->version >= 1))) - r = verity_hash_update(v, req, v->salt, v->salt_size, wait); + r = verity_ahash_update(v, req, v->salt, v->salt_size, wait); return r; } -static int verity_hash_final(struct dm_verity *v, struct ahash_request *req, - u8 *digest, struct crypto_wait *wait) +static int verity_ahash_final(struct dm_verity *v, struct ahash_request *req, + u8 *digest, struct crypto_wait *wait) { int r; if (unlikely(v->salt_size && (!v->version))) { - r = verity_hash_update(v, req, v->salt, v->salt_size, wait); + r = verity_ahash_update(v, req, v->salt, v->salt_size, wait); if (r < 0) { DMERR("%s failed updating salt: %d", __func__, r); @@ -180,23 +183,27 @@ out: return r; } -int verity_hash(struct dm_verity *v, struct ahash_request *req, +int verity_hash(struct dm_verity *v, struct dm_verity_io *io, const u8 *data, size_t len, u8 *digest, bool may_sleep) { int r; - struct crypto_wait wait; - - r = verity_hash_init(v, req, &wait, may_sleep); - if (unlikely(r < 0)) - goto out; - r = verity_hash_update(v, req, data, len, &wait); - if (unlikely(r < 0)) - goto out; + if (static_branch_unlikely(&ahash_enabled) && !v->shash_tfm) { + struct ahash_request *req = verity_io_hash_req(v, io); + struct crypto_wait wait; - r = verity_hash_final(v, req, digest, &wait); + r = verity_ahash_init(v, req, &wait, may_sleep) ?: + verity_ahash_update(v, req, data, len, &wait) ?: + verity_ahash_final(v, req, digest, &wait); + } else { + struct shash_desc *desc = verity_io_hash_req(v, io); -out: + desc->tfm = v->shash_tfm; + r = crypto_shash_import(desc, v->initial_hashstate) ?: + crypto_shash_finup(desc, data, len, digest); + } + if (unlikely(r)) + DMERR("Error hashing block: %d", r); return r; } @@ -325,8 +332,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, goto release_ret_r; } - r = verity_hash(v, verity_io_hash_req(v, io), - data, 1 << v->hash_dev_block_bits, + r = verity_hash(v, io, data, 1 << v->hash_dev_block_bits, verity_io_real_digest(v, io), !io->in_bh); if (unlikely(r < 0)) goto release_ret_r; @@ -342,7 +348,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, r = -EAGAIN; goto release_ret_r; } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, - hash_block, data, NULL) == 0) + hash_block, data) == 0) aux->hash_verified = 1; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, @@ -404,98 +410,8 @@ out: return r; } -/* - * Calculates the digest for the given bio - */ -static int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io, - struct bvec_iter *iter, struct crypto_wait *wait) -{ - unsigned int todo = 1 << v->data_dev_block_bits; - struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); - struct scatterlist sg; - struct ahash_request *req = verity_io_hash_req(v, io); - - do { - int r; - unsigned int len; - struct bio_vec bv = bio_iter_iovec(bio, *iter); - - sg_init_table(&sg, 1); - - len = bv.bv_len; - - if (likely(len >= todo)) - len = todo; - /* - * Operating on a single page at a time looks suboptimal - * until you consider the typical block size is 4,096B. - * Going through this loops twice should be very rare. 
- */ - sg_set_page(&sg, bv.bv_page, len, bv.bv_offset); - ahash_request_set_crypt(req, &sg, NULL, len); - r = crypto_wait_req(crypto_ahash_update(req), wait); - - if (unlikely(r < 0)) { - DMERR("%s crypto op failed: %d", __func__, r); - return r; - } - - bio_advance_iter(bio, iter, len); - todo -= len; - } while (todo); - - return 0; -} - -/* - * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec - * starting from iter. - */ -int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, - struct bvec_iter *iter, - int (*process)(struct dm_verity *v, - struct dm_verity_io *io, u8 *data, - size_t len)) -{ - unsigned int todo = 1 << v->data_dev_block_bits; - struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); - - do { - int r; - u8 *page; - unsigned int len; - struct bio_vec bv = bio_iter_iovec(bio, *iter); - - page = bvec_kmap_local(&bv); - len = bv.bv_len; - - if (likely(len >= todo)) - len = todo; - - r = process(v, io, page, len); - kunmap_local(page); - - if (r < 0) - return r; - - bio_advance_iter(bio, iter, len); - todo -= len; - } while (todo); - - return 0; -} - -static int verity_recheck_copy(struct dm_verity *v, struct dm_verity_io *io, - u8 *data, size_t len) -{ - memcpy(data, io->recheck_buffer, len); - io->recheck_buffer += len; - - return 0; -} - static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, - struct bvec_iter start, sector_t cur_block) + sector_t cur_block, u8 *dest) { struct page *page; void *buffer; @@ -518,8 +434,7 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, if (unlikely(r)) goto free_ret; - r = verity_hash(v, verity_io_hash_req(v, io), buffer, - 1 << v->data_dev_block_bits, + r = verity_hash(v, io, buffer, 1 << v->data_dev_block_bits, verity_io_real_digest(v, io), true); if (unlikely(r)) goto free_ret; @@ -530,11 +445,7 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, goto free_ret; } - io->recheck_buffer = buffer; - r = verity_for_bv_block(v, io, &start, verity_recheck_copy); - if (unlikely(r)) - goto free_ret; - + memcpy(dest, buffer, 1 << v->data_dev_block_bits); r = 0; free_ret: mempool_free(page, &v->recheck_pool); @@ -542,23 +453,36 @@ free_ret: return r; } -static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, - u8 *data, size_t len) -{ - memset(data, 0, len); - return 0; -} - -/* - * Moves the bio iter one data block forward. - */ -static inline void verity_bv_skip_block(struct dm_verity *v, - struct dm_verity_io *io, - struct bvec_iter *iter) +static int verity_handle_data_hash_mismatch(struct dm_verity *v, + struct dm_verity_io *io, + struct bio *bio, sector_t blkno, + u8 *data) { - struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); + if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { + /* + * Error handling code (FEC included) cannot be run in the + * BH workqueue, so fallback to a standard workqueue. 
+ */ + return -EAGAIN; + } + if (verity_recheck(v, io, blkno, data) == 0) { + if (v->validated_blocks) + set_bit(blkno, v->validated_blocks); + return 0; + } +#if defined(CONFIG_DM_VERITY_FEC) + if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, blkno, + data) == 0) + return 0; +#endif + if (bio->bi_status) + return -EIO; /* Error correction failed; Just return error */ - bio_advance_iter(bio, iter, 1 << v->data_dev_block_bits); + if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, blkno)) { + dm_audit_log_bio(DM_MSG_PREFIX, "verify-data", bio, blkno, 0); + return -EIO; + } + return 0; } /* @@ -566,12 +490,10 @@ static inline void verity_bv_skip_block(struct dm_verity *v, */ static int verity_verify_io(struct dm_verity_io *io) { - bool is_zero; struct dm_verity *v = io->v; - struct bvec_iter start; + const unsigned int block_size = 1 << v->data_dev_block_bits; struct bvec_iter iter_copy; struct bvec_iter *iter; - struct crypto_wait wait; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); unsigned int b; @@ -585,16 +507,17 @@ static int verity_verify_io(struct dm_verity_io *io) } else iter = &io->iter; - for (b = 0; b < io->n_blocks; b++) { + for (b = 0; b < io->n_blocks; + b++, bio_advance_iter(bio, iter, block_size)) { int r; sector_t cur_block = io->block + b; - struct ahash_request *req = verity_io_hash_req(v, io); + bool is_zero; + struct bio_vec bv; + void *data; if (v->validated_blocks && bio->bi_status == BLK_STS_OK && - likely(test_bit(cur_block, v->validated_blocks))) { - verity_bv_skip_block(v, io, iter); + likely(test_bit(cur_block, v->validated_blocks))) continue; - } r = verity_hash_for_block(v, io, cur_block, verity_io_want_digest(v, io), @@ -602,67 +525,49 @@ static int verity_verify_io(struct dm_verity_io *io) if (unlikely(r < 0)) return r; + bv = bio_iter_iovec(bio, *iter); + if (unlikely(bv.bv_len < block_size)) { + /* + * Data block spans pages. This should not happen, + * since dm-verity sets dma_alignment to the data block + * size minus 1, and dm-verity also doesn't allow the + * data block size to be greater than PAGE_SIZE. + */ + DMERR_LIMIT("unaligned io (data block spans pages)"); + return -EIO; + } + + data = bvec_kmap_local(&bv); + if (is_zero) { /* * If we expect a zero block, don't validate, just * return zeros. */ - r = verity_for_bv_block(v, io, iter, - verity_bv_zero); - if (unlikely(r < 0)) - return r; - + memset(data, 0, block_size); + kunmap_local(data); continue; } - r = verity_hash_init(v, req, &wait, !io->in_bh); - if (unlikely(r < 0)) - return r; - - start = *iter; - r = verity_for_io_block(v, io, iter, &wait); - if (unlikely(r < 0)) - return r; - - r = verity_hash_final(v, req, verity_io_real_digest(v, io), - &wait); - if (unlikely(r < 0)) + r = verity_hash(v, io, data, block_size, + verity_io_real_digest(v, io), !io->in_bh); + if (unlikely(r < 0)) { + kunmap_local(data); return r; + } if (likely(memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), v->digest_size) == 0)) { if (v->validated_blocks) set_bit(cur_block, v->validated_blocks); + kunmap_local(data); continue; - } else if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { - /* - * Error handling code (FEC included) cannot be run in a - * tasklet since it may sleep, so fallback to work-queue. 
- */ - return -EAGAIN; - } else if (verity_recheck(v, io, start, cur_block) == 0) { - if (v->validated_blocks) - set_bit(cur_block, v->validated_blocks); - continue; -#if defined(CONFIG_DM_VERITY_FEC) - } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, - cur_block, NULL, &start) == 0) { - continue; -#endif - } else { - if (bio->bi_status) { - /* - * Error correction failed; Just return error - */ - return -EIO; - } - if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - cur_block)) { - dm_audit_log_bio(DM_MSG_PREFIX, "verify-data", - bio, cur_block, 0); - return -EIO; - } } + r = verity_handle_data_hash_mismatch(v, io, bio, cur_block, + data); + kunmap_local(data); + if (unlikely(r)) + return r; } return 0; @@ -1014,7 +919,15 @@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) if (limits->physical_block_size < 1 << v->data_dev_block_bits) limits->physical_block_size = 1 << v->data_dev_block_bits; - blk_limits_io_min(limits, limits->logical_block_size); + limits->io_min = limits->logical_block_size; + + /* + * Similar to what dm-crypt does, opt dm-verity out of support for + * direct I/O that is aligned to less than the traditional direct I/O + * alignment requirement of logical_block_size. This prevents dm-verity + * data blocks from crossing pages, eliminating various edge cases. + */ + limits->dma_alignment = limits->logical_block_size - 1; } static void verity_dtr(struct dm_target *ti) @@ -1033,11 +946,16 @@ static void verity_dtr(struct dm_target *ti) kvfree(v->validated_blocks); kfree(v->salt); + kfree(v->initial_hashstate); kfree(v->root_digest); kfree(v->zero_digest); - if (v->tfm) - crypto_free_ahash(v->tfm); + if (v->ahash_tfm) { + static_branch_dec(&ahash_enabled); + crypto_free_ahash(v->ahash_tfm); + } else { + crypto_free_shash(v->shash_tfm); + } kfree(v->alg_name); @@ -1083,7 +1001,7 @@ static int verity_alloc_most_once(struct dm_verity *v) static int verity_alloc_zero_digest(struct dm_verity *v) { int r = -ENOMEM; - struct ahash_request *req; + struct dm_verity_io *io; u8 *zero_data; v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); @@ -1091,9 +1009,9 @@ static int verity_alloc_zero_digest(struct dm_verity *v) if (!v->zero_digest) return r; - req = kmalloc(v->ahash_reqsize, GFP_KERNEL); + io = kmalloc(sizeof(*io) + v->hash_reqsize, GFP_KERNEL); - if (!req) + if (!io) return r; /* verity_dtr will free zero_digest */ zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL); @@ -1101,11 +1019,11 @@ static int verity_alloc_zero_digest(struct dm_verity *v) if (!zero_data) goto out; - r = verity_hash(v, req, zero_data, 1 << v->data_dev_block_bits, + r = verity_hash(v, io, zero_data, 1 << v->data_dev_block_bits, v->zero_digest, true); out: - kfree(req); + kfree(io); kfree(zero_data); return r; @@ -1226,6 +1144,113 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, return r; } +static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) +{ + struct dm_target *ti = v->ti; + struct crypto_ahash *ahash; + struct crypto_shash *shash = NULL; + const char *driver_name; + + v->alg_name = kstrdup(alg_name, GFP_KERNEL); + if (!v->alg_name) { + ti->error = "Cannot allocate algorithm name"; + return -ENOMEM; + } + + /* + * Allocate the hash transformation object that this dm-verity instance + * will use. The vast majority of dm-verity users use CPU-based + * hashing, so when possible use the shash API to minimize the crypto + * API overhead. 
If the ahash API resolves to a different driver + * (likely an off-CPU hardware offload), use ahash instead. Also use + * ahash if the obsolete dm-verity format with the appended salt is + * being used, so that quirk only needs to be handled in one place. + */ + ahash = crypto_alloc_ahash(alg_name, 0, + v->use_bh_wq ? CRYPTO_ALG_ASYNC : 0); + if (IS_ERR(ahash)) { + ti->error = "Cannot initialize hash function"; + return PTR_ERR(ahash); + } + driver_name = crypto_ahash_driver_name(ahash); + if (v->version >= 1 /* salt prepended, not appended? */) { + shash = crypto_alloc_shash(alg_name, 0, 0); + if (!IS_ERR(shash) && + strcmp(crypto_shash_driver_name(shash), driver_name) != 0) { + /* + * ahash gave a different driver than shash, so probably + * this is a case of real hardware offload. Use ahash. + */ + crypto_free_shash(shash); + shash = NULL; + } + } + if (!IS_ERR_OR_NULL(shash)) { + crypto_free_ahash(ahash); + ahash = NULL; + v->shash_tfm = shash; + v->digest_size = crypto_shash_digestsize(shash); + v->hash_reqsize = sizeof(struct shash_desc) + + crypto_shash_descsize(shash); + DMINFO("%s using shash \"%s\"", alg_name, driver_name); + } else { + v->ahash_tfm = ahash; + static_branch_inc(&ahash_enabled); + v->digest_size = crypto_ahash_digestsize(ahash); + v->hash_reqsize = sizeof(struct ahash_request) + + crypto_ahash_reqsize(ahash); + DMINFO("%s using ahash \"%s\"", alg_name, driver_name); + } + if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { + ti->error = "Digest size too big"; + return -EINVAL; + } + return 0; +} + +static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg) +{ + struct dm_target *ti = v->ti; + + if (strcmp(arg, "-") != 0) { + v->salt_size = strlen(arg) / 2; + v->salt = kmalloc(v->salt_size, GFP_KERNEL); + if (!v->salt) { + ti->error = "Cannot allocate salt"; + return -ENOMEM; + } + if (strlen(arg) != v->salt_size * 2 || + hex2bin(v->salt, arg, v->salt_size)) { + ti->error = "Invalid salt"; + return -EINVAL; + } + } + if (v->shash_tfm) { + SHASH_DESC_ON_STACK(desc, v->shash_tfm); + int r; + + /* + * Compute the pre-salted hash state that can be passed to + * crypto_shash_import() for each block later. + */ + v->initial_hashstate = kmalloc( + crypto_shash_statesize(v->shash_tfm), GFP_KERNEL); + if (!v->initial_hashstate) { + ti->error = "Cannot allocate initial hash state"; + return -ENOMEM; + } + desc->tfm = v->shash_tfm; + r = crypto_shash_init(desc) ?: + crypto_shash_update(desc, v->salt, v->salt_size) ?: + crypto_shash_export(desc, v->initial_hashstate); + if (r) { + ti->error = "Cannot set up initial hash state"; + return r; + } + } + return 0; +} + /* * Target parameters: * <version> The current format is version 1. @@ -1350,38 +1375,9 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) } v->hash_start = num_ll; - v->alg_name = kstrdup(argv[7], GFP_KERNEL); - if (!v->alg_name) { - ti->error = "Cannot allocate algorithm name"; - r = -ENOMEM; - goto bad; - } - - v->tfm = crypto_alloc_ahash(v->alg_name, 0, - v->use_bh_wq ? CRYPTO_ALG_ASYNC : 0); - if (IS_ERR(v->tfm)) { - ti->error = "Cannot initialize hash function"; - r = PTR_ERR(v->tfm); - v->tfm = NULL; - goto bad; - } - - /* - * dm-verity performance can vary greatly depending on which hash - * algorithm implementation is used. Help people debug performance - * problems by logging the ->cra_driver_name. 
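A minimal sketch (helper names invented, not from the patch) of the precomputed-salt pattern that verity_setup_salt_and_hashstate() and the shash path of verity_hash() use above: the salted hash state is exported once at table-construction time and re-imported per block, so each block costs one import plus one finup instead of a full init, update(salt), update(data), final sequence.

/* Compute the salted state once; 'state' must hold crypto_shash_statesize() bytes. */
static int example_prepare_state(struct crypto_shash *tfm, const u8 *salt,
				 size_t salt_size, u8 *state)
{
	SHASH_DESC_ON_STACK(desc, tfm);

	desc->tfm = tfm;
	return crypto_shash_init(desc) ?:
	       crypto_shash_update(desc, salt, salt_size) ?:
	       crypto_shash_export(desc, state);
}

/* Per-block hashing then needs only import + finup. */
static int example_hash_block(struct crypto_shash *tfm, const u8 *state,
			      const u8 *data, size_t len, u8 *digest)
{
	SHASH_DESC_ON_STACK(desc, tfm);

	desc->tfm = tfm;
	return crypto_shash_import(desc, state) ?:
	       crypto_shash_finup(desc, data, len, digest);
}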
- */ - DMINFO("%s using implementation \"%s\"", v->alg_name, - crypto_hash_alg_common(v->tfm)->base.cra_driver_name); - - v->digest_size = crypto_ahash_digestsize(v->tfm); - if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { - ti->error = "Digest size too big"; - r = -EINVAL; + r = verity_setup_hash_alg(v, argv[7]); + if (r) goto bad; - } - v->ahash_reqsize = sizeof(struct ahash_request) + - crypto_ahash_reqsize(v->tfm); v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); if (!v->root_digest) { @@ -1397,21 +1393,9 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) } root_hash_digest_to_validate = argv[8]; - if (strcmp(argv[9], "-")) { - v->salt_size = strlen(argv[9]) / 2; - v->salt = kmalloc(v->salt_size, GFP_KERNEL); - if (!v->salt) { - ti->error = "Cannot allocate salt"; - r = -ENOMEM; - goto bad; - } - if (strlen(argv[9]) != v->salt_size * 2 || - hex2bin(v->salt, argv[9], v->salt_size)) { - ti->error = "Invalid salt"; - r = -EINVAL; - goto bad; - } - } + r = verity_setup_salt_and_hashstate(v, argv[9]); + if (r) + goto bad; argv += 10; argc -= 10; @@ -1513,8 +1497,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - ti->per_io_data_size = sizeof(struct dm_verity_io) + - v->ahash_reqsize + v->digest_size * 2; + ti->per_io_data_size = sizeof(struct dm_verity_io) + v->hash_reqsize; r = verity_fec_ctr(v); if (r) @@ -1539,14 +1522,6 @@ bad: } /* - * Check whether a DM target is a verity target. - */ -bool dm_is_verity_target(struct dm_target *ti) -{ - return ti->type->module == THIS_MODULE; -} - -/* * Get the verity mode (error behavior) of a verity target. * * Returns the verity mode of the target, or -EINVAL if 'ti' is not a verity @@ -1599,6 +1574,14 @@ static struct target_type verity_target = { }; module_dm(verity); +/* + * Check whether a DM target is a verity target. 
+ */ +bool dm_is_verity_target(struct dm_target *ti) +{ + return ti->type == &verity_target; +} + MODULE_AUTHOR("Mikulas Patocka <[email protected]>"); MODULE_AUTHOR("Mandeep Baines <[email protected]>"); MODULE_AUTHOR("Will Drewry <[email protected]>"); diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c index 4836508ea50c..d351d7d39c60 100644 --- a/drivers/md/dm-verity-verify-sig.c +++ b/drivers/md/dm-verity-verify-sig.c @@ -126,6 +126,13 @@ int verity_verify_root_hash(const void *root_hash, size_t root_hash_len, NULL, #endif VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); +#ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG_PLATFORM_KEYRING + if (ret == -ENOKEY) + ret = verify_pkcs7_signature(root_hash, root_hash_len, sig_data, + sig_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); +#endif return ret; } diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 20b1bcf03474..aac3a1b1d94a 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -39,9 +39,11 @@ struct dm_verity { struct dm_target *ti; struct dm_bufio_client *bufio; char *alg_name; - struct crypto_ahash *tfm; + struct crypto_ahash *ahash_tfm; /* either this or shash_tfm is set */ + struct crypto_shash *shash_tfm; /* either this or ahash_tfm is set */ u8 *root_digest; /* digest of the root block */ u8 *salt; /* salt: its size is salt_size */ + u8 *initial_hashstate; /* salted initial state, if shash_tfm is set */ u8 *zero_digest; /* digest for a zero block */ unsigned int salt_size; sector_t data_start; /* data offset in 512-byte sectors */ @@ -56,7 +58,7 @@ struct dm_verity { bool hash_failed:1; /* set if hash of any block failed */ bool use_bh_wq:1; /* try to verify in BH wq before normal work-queue */ unsigned int digest_size; /* digest size for the current hash algorithm */ - unsigned int ahash_reqsize;/* the size of temporary space for crypto */ + unsigned int hash_reqsize; /* the size of temporary space for crypto */ enum verity_mode mode; /* mode for handling verification errors */ unsigned int corrupted_errs;/* Number of errors for corrupted blocks */ @@ -89,45 +91,36 @@ struct dm_verity_io { struct work_struct work; struct work_struct bh_work; - char *recheck_buffer; + u8 real_digest[HASH_MAX_DIGESTSIZE]; + u8 want_digest[HASH_MAX_DIGESTSIZE]; /* - * Three variably-size fields follow this struct: - * - * u8 hash_req[v->ahash_reqsize]; - * u8 real_digest[v->digest_size]; - * u8 want_digest[v->digest_size]; - * - * To access them use: verity_io_hash_req(), verity_io_real_digest() - * and verity_io_want_digest(). + * This struct is followed by a variable-sized hash request of size + * v->hash_reqsize, either a struct ahash_request or a struct shash_desc + * (depending on whether ahash_tfm or shash_tfm is being used). To + * access it, use verity_io_hash_req(). 
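An illustrative sketch (invented names) of the single-allocation layout the accessors below rely on: dm core allocates per_io_data_size bytes for each bio, the fixed per-io struct sits first, and the variable-sized hash request begins immediately after it, which is why the io + 1 pointer arithmetic is correct.

struct example_io {
	sector_t block;
	/* followed by hash_reqsize bytes of crypto scratch space */
};

static void *example_io_hash_req(struct example_io *io)
{
	/* 'io + 1' points just past the fixed struct, at the scratch area */
	return io + 1;
}

static size_t example_per_io_size(size_t hash_reqsize)
{
	return sizeof(struct example_io) + hash_reqsize;
}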
*/ }; -static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v, - struct dm_verity_io *io) +static inline void *verity_io_hash_req(struct dm_verity *v, + struct dm_verity_io *io) { - return (struct ahash_request *)(io + 1); + return io + 1; } static inline u8 *verity_io_real_digest(struct dm_verity *v, struct dm_verity_io *io) { - return (u8 *)(io + 1) + v->ahash_reqsize; + return io->real_digest; } static inline u8 *verity_io_want_digest(struct dm_verity *v, struct dm_verity_io *io) { - return (u8 *)(io + 1) + v->ahash_reqsize + v->digest_size; + return io->want_digest; } -extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, - struct bvec_iter *iter, - int (*process)(struct dm_verity *v, - struct dm_verity_io *io, - u8 *data, size_t len)); - -extern int verity_hash(struct dm_verity *v, struct ahash_request *req, +extern int verity_hash(struct dm_verity *v, struct dm_verity_io *io, const u8 *data, size_t len, u8 *digest, bool may_sleep); extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 5d66d916730e..c0d41c36e06e 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -13,8 +13,6 @@ #define DM_MSG_PREFIX "zone" -#define DM_ZONE_INVALID_WP_OFST UINT_MAX - /* * For internal zone reports bypassing the top BIO submission path. */ @@ -146,34 +144,27 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) } /* - * Count conventional zones of a mapped zoned device. If the device - * only has conventional zones, do not expose it as zoned. - */ -static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx, - void *data) -{ - unsigned int *nr_conv_zones = data; - - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - (*nr_conv_zones)++; - - return 0; -} - -/* * Revalidate the zones of a mapped device to initialize the resources necessary * for zone append emulation. Note that we cannot simply use the block layer * blk_revalidate_disk_zones() function here as the mapped device is suspended * (this is called from __bind() context). */ -static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t) +int dm_revalidate_zones(struct dm_table *t, struct request_queue *q) { + struct mapped_device *md = t->md; struct gendisk *disk = md->disk; int ret; + if (!get_capacity(disk)) + return 0; + /* Revalidate only if something changed. */ - if (!disk->nr_zones || disk->nr_zones != md->nr_zones) + if (!disk->nr_zones || disk->nr_zones != md->nr_zones) { + DMINFO("%s using %s zone append", + disk->disk_name, + queue_emulates_zone_append(q) ? "emulated" : "native"); md->nr_zones = 0; + } if (md->nr_zones) return 0; @@ -220,13 +211,129 @@ static bool dm_table_supports_zone_append(struct dm_table *t) return true; } +struct dm_device_zone_count { + sector_t start; + sector_t len; + unsigned int total_nr_seq_zones; + unsigned int target_nr_seq_zones; +}; + +/* + * Count the total number of sequential zones and the number of mapped + * sequential zones of a target zoned device.
+ */ +static int dm_device_count_zones_cb(struct blk_zone *zone, + unsigned int idx, void *data) +{ + struct dm_device_zone_count *zc = data; + + if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) { + zc->total_nr_seq_zones++; + if (zone->start >= zc->start && + zone->start < zc->start + zc->len) + zc->target_nr_seq_zones++; + } + + return 0; +} + +static int dm_device_count_zones(struct dm_dev *dev, + struct dm_device_zone_count *zc) +{ + int ret; + + ret = blkdev_report_zones(dev->bdev, 0, BLK_ALL_ZONES, + dm_device_count_zones_cb, zc); + if (ret < 0) + return ret; + if (!ret) + return -EIO; + return 0; +} + +struct dm_zone_resource_limits { + unsigned int mapped_nr_seq_zones; + struct queue_limits *lim; + bool reliable_limits; +}; + +static int device_get_zone_resource_limits(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct dm_zone_resource_limits *zlim = data; + struct gendisk *disk = dev->bdev->bd_disk; + unsigned int max_open_zones, max_active_zones; + int ret; + struct dm_device_zone_count zc = { + .start = start, + .len = len, + }; + + /* + * If the target is not the whole device, the device zone resources may + * be shared between different targets. Check this by counting the + * number of mapped sequential zones: if this number is smaller than the + * total number of sequential zones of the target device, then resource + * sharing may happen and the zone limits will not be reliable. + */ + ret = dm_device_count_zones(dev, &zc); + if (ret) { + DMERR("Count %s zones failed %d", disk->disk_name, ret); + return ret; + } + + /* + * If the target does not map any sequential zones, then we do not need + * any zone resource limits. + */ + if (!zc.target_nr_seq_zones) + return 0; + + /* + * If the target does not map all sequential zones, the limits + * will not be reliable and we cannot use REQ_OP_ZONE_RESET_ALL. + */ + if (zc.target_nr_seq_zones < zc.total_nr_seq_zones) { + zlim->reliable_limits = false; + ti->zone_reset_all_supported = false; + } + + /* + * If the target maps less sequential zones than the limit values, then + * we do not have limits for this target. + */ + max_active_zones = disk->queue->limits.max_active_zones; + if (max_active_zones >= zc.target_nr_seq_zones) + max_active_zones = 0; + zlim->lim->max_active_zones = + min_not_zero(max_active_zones, zlim->lim->max_active_zones); + + max_open_zones = disk->queue->limits.max_open_zones; + if (max_open_zones >= zc.target_nr_seq_zones) + max_open_zones = 0; + zlim->lim->max_open_zones = + min_not_zero(max_open_zones, zlim->lim->max_open_zones); + + /* + * Also count the total number of sequential zones for the mapped + * device so that when we are done inspecting all its targets, we are + * able to check if the mapped device actually has any sequential zones. 
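To make the clamping in device_get_zone_resource_limits() above concrete, a worked sketch with hypothetical numbers: a target maps 8 sequential zones of a device advertising max_open_zones == 14 and max_active_zones == 6. The open-zone limit can never be exceeded by this target (14 >= 8), so it is treated as unlimited (0); the active-zone limit (6 < 8) is kept and folded into the table-wide value with min_not_zero(), so the smallest non-zero constraint wins.

static void example_fold_zone_limits(struct queue_limits *lim)
{
	unsigned int target_nr_seq_zones = 8;		/* hypothetical */
	unsigned int max_open = 14, max_active = 6;	/* hypothetical */

	if (max_open >= target_nr_seq_zones)
		max_open = 0;	/* can never bite: treat as unlimited */
	lim->max_open_zones = min_not_zero(max_open, lim->max_open_zones);

	if (max_active >= target_nr_seq_zones)
		max_active = 0;
	lim->max_active_zones = min_not_zero(max_active, lim->max_active_zones);
}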
+ */ + zlim->mapped_nr_seq_zones += zc.target_nr_seq_zones; + + return 0; +} + int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *lim) { struct mapped_device *md = t->md; struct gendisk *disk = md->disk; - unsigned int nr_conv_zones = 0; - int ret; + struct dm_zone_resource_limits zlim = { + .reliable_limits = true, + .lim = lim, + }; /* * Check if zone append is natively supported, and if not, set the @@ -240,46 +347,63 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, lim->max_zone_append_sectors = 0; } - if (!get_capacity(md->disk)) - return 0; - /* - * Count conventional zones to check that the mapped device will indeed - * have sequential write required zones. + * Determine the max open and max active zone limits for the mapped + * device by inspecting the zone resource limits and the zones mapped + * by each target. */ - md->zone_revalidate_map = t; - ret = dm_blk_report_zones(disk, 0, UINT_MAX, - dm_check_zoned_cb, &nr_conv_zones); - md->zone_revalidate_map = NULL; - if (ret < 0) { - DMERR("Check zoned failed %d", ret); - return ret; + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + /* + * Assume that the target can accept REQ_OP_ZONE_RESET_ALL. + * device_get_zone_resource_limits() may adjust this if one of + * the devices used by the target does not have all its + * sequential write required zones mapped. + */ + ti->zone_reset_all_supported = true; + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, + device_get_zone_resource_limits, &zlim)) { + DMERR("Could not determine %s zone resource limits", + disk->disk_name); + return -ENODEV; + } } /* - * If we only have conventional zones, expose the mapped device as - * a regular device. + * If we only have conventional zones mapped, expose the mapped device + * as a regular device. */ - if (nr_conv_zones >= ret) { + if (!zlim.mapped_nr_seq_zones) { lim->max_open_zones = 0; lim->max_active_zones = 0; - lim->zoned = false; + lim->max_zone_append_sectors = 0; + lim->zone_write_granularity = 0; + lim->chunk_sectors = 0; + lim->features &= ~BLK_FEAT_ZONED; clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + md->nr_zones = 0; disk->nr_zones = 0; return 0; } - if (!md->disk->nr_zones) { - DMINFO("%s using %s zone append", - md->disk->disk_name, - queue_emulates_zone_append(q) ? "emulated" : "native"); - } - - ret = dm_revalidate_zones(md, t); - if (ret < 0) - return ret; + /* + * Warn once (when the capacity is not yet set) if the mapped device is + * partially using zone resources of the target devices as that leads to + * unreliable limits, i.e. if another mapped device uses the same + * underlying devices, we cannot enforce zone limits to guarantee that + * writing will not lead to errors. Note that we really should return + * an error in such a case, but there is no easy way to find out if + * another mapped device uses the same underlying zoned devices.
+ */ + if (!get_capacity(disk) && !zlim.reliable_limits) + DMWARN("%s zone resource limits may be unreliable", + disk->disk_name); - if (!static_key_enabled(&zoned_enabled.key)) + if (lim->features & BLK_FEAT_ZONED && + !static_key_enabled(&zoned_enabled.key)) static_branch_enable(&zoned_enabled); return 0; } @@ -306,3 +430,39 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone) return; } + +static int dm_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + /* + * For an all-zones reset, ignore conventional, empty, read-only + * and offline zones. + */ + switch (zone->cond) { + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_READONLY: + case BLK_ZONE_COND_OFFLINE: + return 0; + default: + set_bit(idx, (unsigned long *)data); + return 0; + } +} + +int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, + sector_t sector, unsigned int nr_zones, + unsigned long *need_reset) +{ + int ret; + + ret = dm_blk_do_report_zones(md, t, sector, nr_zones, + dm_zone_need_reset_cb, need_reset); + if (ret != nr_zones) { + DMERR("Get %s zone reset bitmap failed\n", + md->disk->disk_name); + return -EIO; + } + + return 0; +} diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 12236e6f46f3..6141fc25d842 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -996,8 +996,8 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->logical_block_size = DMZ_BLOCK_SIZE; limits->physical_block_size = DMZ_BLOCK_SIZE; - blk_limits_io_min(limits, DMZ_BLOCK_SIZE); - blk_limits_io_opt(limits, DMZ_BLOCK_SIZE); + limits->io_min = DMZ_BLOCK_SIZE; + limits->io_opt = DMZ_BLOCK_SIZE; limits->discard_alignment = 0; limits->discard_granularity = DMZ_BLOCK_SIZE; @@ -1009,7 +1009,7 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->max_sectors = chunk_sectors; /* We are exposing a drive-managed zoned block device */ - limits->zoned = false; + limits->features &= ~BLK_FEAT_ZONED; } /* diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 13037d6a6f62..97fab2087df8 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -11,6 +11,7 @@ #include "dm-uevent.h" #include "dm-ima.h" +#include <linux/bio-integrity.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> @@ -645,7 +646,7 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, /* Set default bdev, but target must bio_set_dev() before issuing IO */ clone->bi_bdev = md->disk->part0; - if (unlikely(ti->needs_bio_set_dev)) + if (likely(ti != NULL) && unlikely(ti->needs_bio_set_dev)) bio_set_dev(clone, md->disk->part0); if (len) { @@ -1107,7 +1108,7 @@ static void clone_endio(struct bio *bio) blk_status_t error = bio->bi_status; struct dm_target_io *tio = clone_to_tio(bio); struct dm_target *ti = tio->ti; - dm_endio_fn endio = ti->type->end_io; + dm_endio_fn endio = likely(ti != NULL) ? ti->type->end_io : NULL; struct dm_io *io = tio->io; struct mapped_device *md = io->md; @@ -1154,7 +1155,7 @@ static void clone_endio(struct bio *bio) } if (static_branch_unlikely(&swap_bios_enabled) && - unlikely(swap_bios_limit(ti, bio))) + likely(ti != NULL) && unlikely(swap_bios_limit(ti, bio))) up(&md->swap_bios_semaphore); free_tio(bio); @@ -1188,7 +1189,7 @@ static sector_t __max_io_len(struct dm_target *ti, sector_t sector, return len; return min_t(sector_t, len, min(max_sectors ? 
: queue_max_sectors(ti->table->md->queue), - blk_chunk_sectors_left(target_offset, max_granularity))); + blk_boundary_sectors_left(target_offset, max_granularity))); } static inline sector_t max_io_len(struct dm_target *ti, sector_t sector) @@ -1553,17 +1554,43 @@ static void __send_empty_flush(struct clone_info *ci) ci->sector_count = 0; ci->io->tio.clone.bi_iter.bi_size = 0; - for (unsigned int i = 0; i < t->num_targets; i++) { - unsigned int bios; - struct dm_target *ti = dm_table_get_target(t, i); + if (!t->flush_bypasses_map) { + for (unsigned int i = 0; i < t->num_targets; i++) { + unsigned int bios; + struct dm_target *ti = dm_table_get_target(t, i); - if (unlikely(ti->num_flush_bios == 0)) - continue; + if (unlikely(ti->num_flush_bios == 0)) + continue; - atomic_add(ti->num_flush_bios, &ci->io->io_count); - bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, - NULL, GFP_NOWAIT); - atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); + atomic_add(ti->num_flush_bios, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, + NULL, GFP_NOWAIT); + atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); + } + } else { + /* + * Note that there's no need to grab t->devices_lock here + * because the targets that support flush optimization don't + * modify the list of devices. + */ + struct list_head *devices = dm_table_get_devices(t); + unsigned int len = 0; + struct dm_dev_internal *dd; + list_for_each_entry(dd, devices, list) { + struct bio *clone; + /* + * Note that the structure dm_target_io is not + * associated with any target (because the device may be + * used by multiple targets), so we set tio->ti = NULL. + * We must check for NULL in the I/O processing path, to + * avoid NULL pointer dereference. + */ + clone = alloc_tio(ci, NULL, 0, &len, GFP_NOIO); + atomic_add(1, &ci->io->io_count); + bio_set_dev(clone, dd->dm_dev->bdev); + clone->bi_end_io = clone_endio; + dm_submit_bio_remap(clone, NULL); + } } /* @@ -1598,20 +1625,19 @@ static void __send_abnormal_io(struct clone_info *ci, struct dm_target *ti, static bool is_abnormal_io(struct bio *bio) { - enum req_op op = bio_op(bio); - - if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) { - switch (op) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - return true; - default: - break; - } + switch (bio_op(bio)) { + case REQ_OP_READ: + case REQ_OP_WRITE: + case REQ_OP_FLUSH: + return false; + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + case REQ_OP_ZONE_RESET_ALL: + return true; + default: + return false; } - - return false; } static blk_status_t __process_abnormal_io(struct clone_info *ci, @@ -1632,14 +1658,10 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci, case REQ_OP_SECURE_ERASE: num_bios = ti->num_secure_erase_bios; max_sectors = limits->max_secure_erase_sectors; - if (ti->max_secure_erase_granularity) - max_granularity = max_sectors; break; case REQ_OP_WRITE_ZEROES: num_bios = ti->num_write_zeroes_bios; max_sectors = limits->max_write_zeroes_sectors; - if (ti->max_write_zeroes_granularity) - max_granularity = max_sectors; break; default: break; @@ -1776,6 +1798,119 @@ static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) { return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0); } + +static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci, + struct dm_target *ti) +{ + struct bio_list blist = BIO_EMPTY_LIST; + struct mapped_device *md = ci->io->md; 
+ unsigned int zone_sectors = md->disk->queue->limits.chunk_sectors; + unsigned long *need_reset; + unsigned int i, nr_zones, nr_reset; + unsigned int num_bios = 0; + blk_status_t sts = BLK_STS_OK; + sector_t sector = ti->begin; + struct bio *clone; + int ret; + + nr_zones = ti->len >> ilog2(zone_sectors); + need_reset = bitmap_zalloc(nr_zones, GFP_NOIO); + if (!need_reset) + return BLK_STS_RESOURCE; + + ret = dm_zone_get_reset_bitmap(md, ci->map, ti->begin, + nr_zones, need_reset); + if (ret) { + sts = BLK_STS_IOERR; + goto free_bitmap; + } + + /* If we have no zone to reset, we are done. */ + nr_reset = bitmap_weight(need_reset, nr_zones); + if (!nr_reset) + goto free_bitmap; + + atomic_add(nr_zones, &ci->io->io_count); + + for (i = 0; i < nr_zones; i++) { + + if (!test_bit(i, need_reset)) { + sector += zone_sectors; + continue; + } + + if (bio_list_empty(&blist)) { + /* This may take a while, so be nice to others */ + if (num_bios) + cond_resched(); + + /* + * We may need to reset thousands of zones, so let's + * not go crazy with the clone allocation. + */ + alloc_multiple_bios(&blist, ci, ti, min(nr_reset, 32), + NULL, GFP_NOIO); + } + + /* Get a clone and change it to a regular reset operation. */ + clone = bio_list_pop(&blist); + clone->bi_opf &= ~REQ_OP_MASK; + clone->bi_opf |= REQ_OP_ZONE_RESET | REQ_SYNC; + clone->bi_iter.bi_sector = sector; + clone->bi_iter.bi_size = 0; + __map_bio(clone); + + sector += zone_sectors; + num_bios++; + nr_reset--; + } + + WARN_ON_ONCE(!bio_list_empty(&blist)); + atomic_sub(nr_zones - num_bios, &ci->io->io_count); + ci->sector_count = 0; + +free_bitmap: + bitmap_free(need_reset); + + return sts; +} + +static void __send_zone_reset_all_native(struct clone_info *ci, + struct dm_target *ti) +{ + unsigned int bios; + + atomic_add(1, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, 1, NULL, GFP_NOIO); + atomic_sub(1 - bios, &ci->io->io_count); + + ci->sector_count = 0; +} + +static blk_status_t __send_zone_reset_all(struct clone_info *ci) +{ + struct dm_table *t = ci->map; + blk_status_t sts = BLK_STS_OK; + + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (ti->zone_reset_all_supported) { + __send_zone_reset_all_native(ci, ti); + continue; + } + + sts = __send_zone_reset_all_emulated(ci, ti); + if (sts != BLK_STS_OK) + break; + } + + /* Release the reference that alloc_io() took for submission. */ + atomic_sub(1, &ci->io->io_count); + + return sts; +} + #else static inline bool dm_zone_bio_needs_split(struct mapped_device *md, struct bio *bio) @@ -1786,6 +1921,10 @@ static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) { return false; } +static blk_status_t __send_zone_reset_all(struct clone_info *ci) +{ + return BLK_STS_NOTSUPP; +} #endif /* @@ -1799,9 +1938,14 @@ static void dm_split_and_process_bio(struct mapped_device *md, blk_status_t error = BLK_STS_OK; bool is_abnormal, need_split; - need_split = is_abnormal = is_abnormal_io(bio); - if (static_branch_unlikely(&zoned_enabled)) - need_split = is_abnormal || dm_zone_bio_needs_split(md, bio); + is_abnormal = is_abnormal_io(bio); + if (static_branch_unlikely(&zoned_enabled)) { + /* Special case REQ_OP_ZONE_RESET_ALL as it cannot be split. 
*/ + need_split = (bio_op(bio) != REQ_OP_ZONE_RESET_ALL) && + (is_abnormal || dm_zone_bio_needs_split(md, bio)); + } else { + need_split = is_abnormal; + } if (unlikely(need_split)) { /* @@ -1842,6 +1986,12 @@ static void dm_split_and_process_bio(struct mapped_device *md, goto out; } + if (static_branch_unlikely(&zoned_enabled) && + (bio_op(bio) == REQ_OP_ZONE_RESET_ALL)) { + error = __send_zone_reset_all(&ci); + goto out; + } + error = __split_and_process_bio(&ci); if (error || !ci.sector_count) goto out; @@ -2386,22 +2536,15 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) struct table_device *td; int r; - switch (type) { - case DM_TYPE_REQUEST_BASED: + WARN_ON_ONCE(type == DM_TYPE_NONE); + + if (type == DM_TYPE_REQUEST_BASED) { md->disk->fops = &dm_rq_blk_dops; r = dm_mq_init_request_queue(md, t); if (r) { DMERR("Cannot initialize queue for request-based dm mapped device"); return r; } - break; - case DM_TYPE_BIO_BASED: - case DM_TYPE_DAX_BIO_BASED: - blk_queue_flag_set(QUEUE_FLAG_IO_STAT, md->queue); - break; - case DM_TYPE_NONE: - WARN_ON_ONCE(true); - break; } r = dm_calculate_queue_limits(t, &limits); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 53ef8207fe2c..cc466ad5cb1d 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -103,12 +103,16 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); */ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *lim); +int dm_revalidate_zones(struct dm_table *t, struct request_queue *q); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED int dm_blk_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); int dm_zone_map_bio(struct dm_target_io *io); +int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, + sector_t sector, unsigned int nr_zones, + unsigned long *need_reset); #else #define dm_blk_report_zones NULL static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 0a2d37eb38ef..08232d8dc815 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -227,6 +227,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; + unsigned int bitmap_limit = (bitmap->storage.file_pages - pg_index) << + PAGE_SHIFT; loff_t sboff, offset = mddev->bitmap_info.offset; sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE; unsigned int size = PAGE_SIZE; @@ -269,11 +271,9 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, if (size == 0) /* bitmap runs in to data */ return -EINVAL; - } else { - /* DATA METADATA BITMAP - no problems */ } - md_super_write(mddev, rdev, sboff + ps, (int) size, page); + md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page); return 0; } diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 8e36a0feec09..1d0db62f0351 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -15,6 +15,7 @@ #define LVB_SIZE 64 #define NEW_DEV_TIMEOUT 5000 +#define WAIT_DLM_LOCK_TIMEOUT (30 * HZ) struct dlm_lock_resource { dlm_lockspace_t *ls; @@ -56,6 +57,7 @@ struct resync_info { #define MD_CLUSTER_ALREADY_IN_CLUSTER 6 #define MD_CLUSTER_PENDING_RECV_EVENT 7 #define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8 
+#define MD_CLUSTER_WAITING_FOR_SYNC 9 struct md_cluster_info { struct mddev *mddev; /* the md device which md_cluster_info belongs to */ @@ -91,6 +93,7 @@ struct md_cluster_info { sector_t sync_hi; }; +/* For compatibility, add the new msg_type at the end. */ enum msg_type { METADATA_UPDATED = 0, RESYNCING, @@ -100,6 +103,7 @@ enum msg_type { BITMAP_NEEDS_SYNC, CHANGE_CAPACITY, BITMAP_RESIZE, + RESYNCING_START, }; struct cluster_msg { @@ -130,8 +134,13 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) 0, sync_ast, res, res->bast); if (ret) return ret; - wait_event(res->sync_locking, res->sync_locking_done); + ret = wait_event_timeout(res->sync_locking, res->sync_locking_done, + WAIT_DLM_LOCK_TIMEOUT); res->sync_locking_done = false; + if (!ret) { + pr_err("locking DLM '%s' timeout!\n", res->name); + return -EBUSY; + } if (res->lksb.sb_status == 0) res->mode = mode; return res->lksb.sb_status; @@ -455,6 +464,7 @@ static void process_suspend_info(struct mddev *mddev, clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery); remove_suspend_info(mddev, slot); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state); md_wakeup_thread(mddev->thread); return; } @@ -525,6 +535,7 @@ static int process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) res = -1; } clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); + set_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state); return res; } @@ -593,6 +604,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) case CHANGE_CAPACITY: set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); break; + case RESYNCING_START: + clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &mddev->cluster_info->state); + break; case RESYNCING: set_bit(MD_RESYNCING_REMOTE, &mddev->recovery); process_suspend_info(mddev, le32_to_cpu(msg->slot), @@ -743,7 +757,7 @@ static void unlock_comm(struct md_cluster_info *cinfo) */ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) { - int error; + int error, unlock_error; int slot = cinfo->slot_number - 1; cmsg->slot = cpu_to_le32(slot); @@ -751,7 +765,7 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX); if (error) { pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error); - goto failed_message; + return error; } memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg, @@ -781,14 +795,10 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) } failed_ack: - error = dlm_unlock_sync(cinfo->message_lockres); - if (unlikely(error != 0)) { + while ((unlock_error = dlm_unlock_sync(cinfo->message_lockres))) pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n", - error); - /* in case the message can't be released due to some reason */ - goto failed_ack; - } -failed_message: + unlock_error); + return error; } @@ -887,7 +897,7 @@ static int join(struct mddev *mddev, int nodes) memset(str, 0, 64); sprintf(str, "%pU", mddev->uuid); ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name, - 0, LVB_SIZE, &md_ls_ops, mddev, + DLM_LSFL_SOFTIRQ, LVB_SIZE, &md_ls_ops, mddev, &ops_rv, &cinfo->lockspace); if (ret) goto err; @@ -1343,6 +1353,23 @@ static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi) spin_unlock_irq(&cinfo->suspend_lock); } +static int resync_status_get(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + return 
test_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state); +} + +static int resync_start_notify(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg = {0}; + + cmsg.type = cpu_to_le32(RESYNCING_START); + + return sendmsg(cinfo, &cmsg, 0); +} + static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; @@ -1570,13 +1597,15 @@ out: return err; } -static struct md_cluster_operations cluster_ops = { +static const struct md_cluster_operations cluster_ops = { .join = join, .leave = leave, .slot_number = slot_number, .resync_start = resync_start, .resync_finish = resync_finish, .resync_info_update = resync_info_update, + .resync_start_notify = resync_start_notify, + .resync_status_get = resync_status_get, .resync_info_get = resync_info_get, .metadata_update_start = metadata_update_start, .metadata_update_finish = metadata_update_finish, diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index a78e3021775d..470bf18ffde5 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -14,6 +14,8 @@ struct md_cluster_operations { int (*leave)(struct mddev *mddev); int (*slot_number)(struct mddev *mddev); int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); + int (*resync_start_notify)(struct mddev *mddev); + int (*resync_status_get)(struct mddev *mddev); void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi); int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev); diff --git a/drivers/md/md.c b/drivers/md/md.c index aff9118ff697..d3a837506a36 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -69,13 +69,23 @@ #include "md-bitmap.h" #include "md-cluster.h" +static const char *action_name[NR_SYNC_ACTIONS] = { + [ACTION_RESYNC] = "resync", + [ACTION_RECOVER] = "recover", + [ACTION_CHECK] = "check", + [ACTION_REPAIR] = "repair", + [ACTION_RESHAPE] = "reshape", + [ACTION_FROZEN] = "frozen", + [ACTION_IDLE] = "idle", +}; + /* pers_list is a list of registered personalities protected by pers_lock. 
*/ static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); static const struct kobj_type md_ktype; -struct md_cluster_operations *md_cluster_ops; +const struct md_cluster_operations *md_cluster_ops; EXPORT_SYMBOL(md_cluster_ops); static struct module *md_cluster_mod; @@ -479,7 +489,6 @@ int mddev_suspend(struct mddev *mddev, bool interruptible) */ WRITE_ONCE(mddev->suspended, mddev->suspended + 1); - del_timer_sync(&mddev->safemode_timer); /* restrict memory reclaim I/O while the raid array is suspended */ mddev->noio_flag = memalloc_noio_save(); @@ -550,13 +559,9 @@ static void md_end_flush(struct bio *bio) rdev_dec_pending(rdev, mddev); - if (atomic_dec_and_test(&mddev->flush_pending)) { - /* The pair is percpu_ref_get() from md_flush_request() */ - percpu_ref_put(&mddev->active_io); - + if (atomic_dec_and_test(&mddev->flush_pending)) /* The pre-request flush has finished */ queue_work(md_wq, &mddev->flush_work); - } } static void md_submit_flush_data(struct work_struct *ws); @@ -587,12 +592,8 @@ static void submit_flushes(struct work_struct *ws) rcu_read_lock(); } rcu_read_unlock(); - if (atomic_dec_and_test(&mddev->flush_pending)) { - /* The pair is percpu_ref_get() from md_flush_request() */ - percpu_ref_put(&mddev->active_io); - + if (atomic_dec_and_test(&mddev->flush_pending)) queue_work(md_wq, &mddev->flush_work); - } } static void md_submit_flush_data(struct work_struct *ws) @@ -617,8 +618,20 @@ static void md_submit_flush_data(struct work_struct *ws) bio_endio(bio); } else { bio->bi_opf &= ~REQ_PREFLUSH; - md_handle_request(mddev, bio); + + /* + * make_request() will never return an error here; it only + * returns an error in raid5_make_request() when used by dm-raid. + * Since dm always splits data and flush operations into + * two separate ios, the io size of a flush submitted by dm + * is always 0, so make_request() will not be called here. + */ + if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio))) + bio_io_error(bio); } + + /* The pair is percpu_ref_get() from md_flush_request() */ + percpu_ref_put(&mddev->active_io); } /* @@ -654,24 +667,22 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio) WARN_ON(percpu_ref_is_zero(&mddev->active_io)); percpu_ref_get(&mddev->active_io); mddev->flush_bio = bio; - bio = NULL; - } - spin_unlock_irq(&mddev->lock); - - if (!bio) { + spin_unlock_irq(&mddev->lock); INIT_WORK(&mddev->flush_work, submit_flushes); queue_work(md_wq, &mddev->flush_work); - } else { - /* flush was performed for some other bio while we waited. */ - if (bio->bi_iter.bi_size == 0) - /* an empty barrier - all done */ - bio_endio(bio); - else { - bio->bi_opf &= ~REQ_PREFLUSH; - return false; - } + return true; } - return true; + + /* flush was performed for some other bio while we waited.
*/ + spin_unlock_irq(&mddev->lock); + if (bio->bi_iter.bi_size == 0) { + /* pure flush without data - all done */ + bio_endio(bio); + return true; + } + + bio->bi_opf &= ~REQ_PREFLUSH; + return false; } EXPORT_SYMBOL(md_flush_request); @@ -742,7 +753,6 @@ int mddev_init(struct mddev *mddev) mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); - mutex_init(&mddev->sync_mutex); mutex_init(&mddev->suspend_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); @@ -758,7 +768,7 @@ int mddev_init(struct mddev *mddev) init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; mddev->reshape_backwards = 0; - mddev->last_sync_action = "none"; + mddev->last_sync_action = ACTION_IDLE; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; @@ -2410,36 +2420,10 @@ static LIST_HEAD(pending_raid_disks); */ int md_integrity_register(struct mddev *mddev) { - struct md_rdev *rdev, *reference = NULL; - if (list_empty(&mddev->disks)) return 0; /* nothing to do */ - if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk)) - return 0; /* shouldn't register, or already is */ - rdev_for_each(rdev, mddev) { - /* skip spares and non-functional disks */ - if (test_bit(Faulty, &rdev->flags)) - continue; - if (rdev->raid_disk < 0) - continue; - if (!reference) { - /* Use the first rdev as the reference */ - reference = rdev; - continue; - } - /* does this rdev's profile match the reference profile? */ - if (blk_integrity_compare(reference->bdev->bd_disk, - rdev->bdev->bd_disk) < 0) - return -EINVAL; - } - if (!reference || !bdev_get_integrity(reference->bdev)) - return 0; - /* - * All component devices are integrity capable and have matching - * profiles, register the common profile for the md device. - */ - blk_integrity_register(mddev->gendisk, - bdev_get_integrity(reference->bdev)); + if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) + return 0; /* shouldn't register */ pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || @@ -2459,32 +2443,6 @@ int md_integrity_register(struct mddev *mddev) } EXPORT_SYMBOL(md_integrity_register); -/* - * Attempt to add an rdev, but only if it is consistent with the current - * integrity profile - */ -int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) -{ - struct blk_integrity *bi_mddev; - - if (mddev_is_dm(mddev)) - return 0; - - bi_mddev = blk_get_integrity(mddev->gendisk); - - if (!bi_mddev) /* nothing to do */ - return 0; - - if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { - pr_err("%s: incompatible integrity profile for %pg\n", - mdname(mddev), rdev->bdev); - return -ENXIO; - } - - return 0; -} -EXPORT_SYMBOL(md_integrity_add_rdev); - static bool rdev_read_only(struct md_rdev *rdev) { return bdev_read_only(rdev->bdev) || @@ -4867,30 +4825,81 @@ out_unlock: static struct md_sysfs_entry md_metadata = __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); -static ssize_t -action_show(struct mddev *mddev, char *page) +enum sync_action md_sync_action(struct mddev *mddev) { - char *type = "idle"; unsigned long recovery = mddev->recovery; + + /* + * frozen has the highest priority, means running sync_thread will be + * stopped immediately, and no new sync_thread can start. 
+ */ if (test_bit(MD_RECOVERY_FROZEN, &recovery)) - type = "frozen"; - else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || - (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { - if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) - type = "reshape"; - else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { - if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) - type = "resync"; - else if (test_bit(MD_RECOVERY_CHECK, &recovery)) - type = "check"; - else - type = "repair"; - } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) - type = "recover"; - else if (mddev->reshape_position != MaxSector) - type = "reshape"; + return ACTION_FROZEN; + + /* + * read-only array can't register sync_thread, and it can only + * add/remove spares. + */ + if (!md_is_rdwr(mddev)) + return ACTION_IDLE; + + /* + * idle means no sync_thread is running, and no new sync_thread is + * requested. + */ + if (!test_bit(MD_RECOVERY_RUNNING, &recovery) && + !test_bit(MD_RECOVERY_NEEDED, &recovery)) + return ACTION_IDLE; + + if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || + mddev->reshape_position != MaxSector) + return ACTION_RESHAPE; + + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) + return ACTION_RECOVER; + + if (test_bit(MD_RECOVERY_SYNC, &recovery)) { + /* + * MD_RECOVERY_CHECK must be paired with + * MD_RECOVERY_REQUESTED. + */ + if (test_bit(MD_RECOVERY_CHECK, &recovery)) + return ACTION_CHECK; + if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) + return ACTION_REPAIR; + return ACTION_RESYNC; } - return sprintf(page, "%s\n", type); + + /* + * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no + * sync_action is specified. + */ + return ACTION_IDLE; +} + +enum sync_action md_sync_action_by_name(const char *page) +{ + enum sync_action action; + + for (action = 0; action < NR_SYNC_ACTIONS; ++action) { + if (cmd_match(page, action_name[action])) + return action; + } + + return NR_SYNC_ACTIONS; +} + +const char *md_sync_action_name(enum sync_action action) +{ + return action_name[action]; +} + +static ssize_t +action_show(struct mddev *mddev, char *page) +{ + enum sync_action action = md_sync_action(mddev); + + return sprintf(page, "%s\n", md_sync_action_name(action)); } /** @@ -4899,15 +4908,10 @@ action_show(struct mddev *mddev, char *page) * @locked: if set, reconfig_mutex will still be held after this function * return; if not set, reconfig_mutex will be released after this * function return. - * @check_seq: if set, only wait for curent running sync_thread to stop, noted - * that new sync_thread can still start. 
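action_show() and md_sync_action_by_name() are now the two ends of the same table. A small user-space illustration of driving the attribute (the md0 path and root privileges are assumptions, not part of the patch):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[32];
        ssize_t n;
        int fd = open("/sys/block/md0/md/sync_action", O_RDWR);

        if (fd < 0)
                return 1;
        /* parsed by md_sync_action_by_name(); the trailing '\n' is fine */
        if (write(fd, "check\n", 6) < 0)
                perror("write");
        lseek(fd, 0, SEEK_SET);
        n = read(fd, buf, sizeof(buf) - 1); /* rendered by action_show() */
        if (n > 0) {
                buf[n] = '\0';
                fputs(buf, stdout); /* e.g. "check" */
        }
        close(fd);
        return 0;
}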
*/ -static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) +static void stop_sync_thread(struct mddev *mddev, bool locked) { - int sync_seq; - - if (check_seq) - sync_seq = atomic_read(&mddev->sync_seq); + int sync_seq = atomic_read(&mddev->sync_seq); if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { if (!locked) @@ -4928,7 +4932,8 @@ static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (check_seq && sync_seq != atomic_read(&mddev->sync_seq))); + (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) && + sync_seq != atomic_read(&mddev->sync_seq))); if (locked) mddev_lock_nointr(mddev); @@ -4939,7 +4944,7 @@ void md_idle_sync_thread(struct mddev *mddev) lockdep_assert_held(&mddev->reconfig_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev, true, true); + stop_sync_thread(mddev, true); } EXPORT_SYMBOL_GPL(md_idle_sync_thread); @@ -4948,7 +4953,7 @@ void md_frozen_sync_thread(struct mddev *mddev) lockdep_assert_held(&mddev->reconfig_mutex); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev, true, false); + stop_sync_thread(mddev, true); } EXPORT_SYMBOL_GPL(md_frozen_sync_thread); @@ -4963,100 +4968,127 @@ void md_unfrozen_sync_thread(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); -static void idle_sync_thread(struct mddev *mddev) +static int mddev_start_reshape(struct mddev *mddev) { - mutex_lock(&mddev->sync_mutex); - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - - if (mddev_lock(mddev)) { - mutex_unlock(&mddev->sync_mutex); - return; - } - - stop_sync_thread(mddev, false, true); - mutex_unlock(&mddev->sync_mutex); -} + int ret; -static void frozen_sync_thread(struct mddev *mddev) -{ - mutex_lock(&mddev->sync_mutex); - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (mddev->pers->start_reshape == NULL) + return -EINVAL; - if (mddev_lock(mddev)) { - mutex_unlock(&mddev->sync_mutex); - return; + if (mddev->reshape_position == MaxSector || + mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev)) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + ret = mddev->pers->start_reshape(mddev); + if (ret) + return ret; + } else { + /* + * If reshape is still in progress, and md_check_recovery() can + * continue to reshape, don't restart reshape because data can + * be corrupted for raid456. 
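Folding the old check_seq parameter into the wait condition makes the two callers' semantics explicit: a frozen caller waits until the thread is really gone, while an idle caller is satisfied once sync_seq moves on. Restated as a standalone predicate, purely for illustration:

/* illustration only: what the unified wait_event() condition tests */
static bool demo_sync_thread_stopped(struct mddev *mddev, int old_seq)
{
        /* the running sync thread has exited: every caller is done */
        if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
                return true;

        /*
         * "idle" callers (MD_RECOVERY_FROZEN clear) only need the old
         * thread to finish; a sync_seq bump proves that even if a new
         * thread already started. "frozen" callers keep waiting.
         */
        return !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) &&
               old_seq != atomic_read(&mddev->sync_seq);
}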
+ */ + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - stop_sync_thread(mddev, false, false); - mutex_unlock(&mddev->sync_mutex); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); + return 0; } static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { + int ret; + enum sync_action action; + if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; +retry: + if (work_busy(&mddev->sync_work)) + flush_work(&mddev->sync_work); - if (cmd_match(page, "idle")) - idle_sync_thread(mddev); - else if (cmd_match(page, "frozen")) - frozen_sync_thread(mddev); - else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; - else if (cmd_match(page, "resync")) - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else if (cmd_match(page, "recover")) { - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if (cmd_match(page, "reshape")) { - int err; - if (mddev->pers->start_reshape == NULL) - return -EINVAL; - err = mddev_lock(mddev); - if (!err) { - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { - err = -EBUSY; - } else if (mddev->reshape_position == MaxSector || - mddev->pers->check_reshape == NULL || - mddev->pers->check_reshape(mddev)) { - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - err = mddev->pers->start_reshape(mddev); - } else { - /* - * If reshape is still in progress, and - * md_check_recovery() can continue to reshape, - * don't restart reshape because data can be - * corrupted for raid456. - */ - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - } - mddev_unlock(mddev); + ret = mddev_lock(mddev); + if (ret) + return ret; + + if (work_busy(&mddev->sync_work)) { + mddev_unlock(mddev); + goto retry; + } + + action = md_sync_action_by_name(page); + + /* TODO: mdadm relies on "idle" to start sync_thread.
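With the rewrite, what a write to sync_action does now depends explicitly on whether a sync thread is registered. A summary of the outcomes, matching the switch statements that follow (illustrative):

/*
 * echo <action> > md/sync_action:
 *
 *   sync thread RUNNING:
 *     frozen  -> freeze and stop the running thread, return success
 *     idle    -> interrupt the running thread (mdadm relies on this
 *                to later start a new one)
 *     resync/recover/check/repair/reshape -> -EBUSY
 *   no sync thread:
 *     frozen  -> just set MD_RECOVERY_FROZEN
 *     reshape -> mddev_start_reshape(), which may fail
 *     check   -> set SYNC | REQUESTED | CHECK
 *     repair  -> set SYNC | REQUESTED
 *     resync/recover/idle -> clear FROZEN (plus RECOVER for "recover")
 *   unrecognised strings -> -EINVAL
 */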
*/ + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + switch (action) { + case ACTION_FROZEN: + md_frozen_sync_thread(mddev); + ret = len; + goto out; + case ACTION_IDLE: + md_idle_sync_thread(mddev); + break; + case ACTION_RESHAPE: + case ACTION_RECOVER: + case ACTION_CHECK: + case ACTION_REPAIR: + case ACTION_RESYNC: + ret = -EBUSY; + goto out; + default: + ret = -EINVAL; + goto out; } - if (err) - return err; - sysfs_notify_dirent_safe(mddev->sysfs_degraded); } else { - if (cmd_match(page, "check")) + switch (action) { + case ACTION_FROZEN: + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + ret = len; + goto out; + case ACTION_RESHAPE: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + ret = mddev_start_reshape(mddev); + if (ret) + goto out; + break; + case ACTION_RECOVER: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + break; + case ACTION_CHECK: set_bit(MD_RECOVERY_CHECK, &mddev->recovery); - else if (!cmd_match(page, "repair")) - return -EINVAL; - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + fallthrough; + case ACTION_REPAIR: + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + fallthrough; + case ACTION_RESYNC: + case ACTION_IDLE: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + break; + default: + ret = -EINVAL; + goto out; + } } + if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode */ - flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; md_wakeup_thread(mddev->sync_thread); } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); - return len; + ret = len; + +out: + mddev_unlock(mddev); + return ret; } static struct md_sysfs_entry md_scan_mode = @@ -5065,7 +5097,8 @@ __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); static ssize_t last_sync_action_show(struct mddev *mddev, char *page) { - return sprintf(page, "%s\n", mddev->last_sync_action); + return sprintf(page, "%s\n", + md_sync_action_name(mddev->last_sync_action)); } static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); @@ -5755,14 +5788,20 @@ static const struct kobj_type md_ktype = { int mdp_major = 0; /* stack the limit for all rdevs into lim */ -void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim) +int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, + unsigned int flags) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) { queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); + if ((flags & MDDEV_STACK_INTEGRITY) && + !queue_limits_stack_integrity_bdev(lim, rdev->bdev)) + return -EINVAL; } + + return 0; } EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); @@ -5777,6 +5816,14 @@ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) lim = queue_limits_start_update(mddev->gendisk->queue); queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); + + if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { + pr_err("%s: incompatible integrity profile for %pg\n", + mdname(mddev), rdev->bdev); + queue_limits_cancel_update(mddev->gendisk->queue); + return -ENXIO; + } + return queue_limits_commit_update(mddev->gendisk->queue, &lim); } EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); @@ -5806,6 +5853,14 @@ static void 
mddev_delayed_delete(struct work_struct *ws) kobject_put(&mddev->kobj); } +void md_init_stacking_limits(struct queue_limits *lim) +{ + blk_set_stacking_limits(lim); + lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | + BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; +} +EXPORT_SYMBOL_GPL(md_init_stacking_limits); + struct mddev *md_alloc(dev_t dev, char *name) { /* @@ -5823,7 +5878,7 @@ struct mddev *md_alloc(dev_t dev, char *name) int partitioned; int shift; int unit; - int error ; + int error; /* * Wait for any previous instance of this device to be completely @@ -5881,7 +5936,6 @@ struct mddev *md_alloc(dev_t dev, char *name) disk->fops = &md_fops; disk->private_data = mddev; - blk_queue_write_cache(disk->queue, true, true); disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; error = add_disk(disk); @@ -6185,28 +6239,6 @@ int md_run(struct mddev *mddev) } } - if (!mddev_is_dm(mddev)) { - struct request_queue *q = mddev->gendisk->queue; - bool nonrot = true; - - rdev_for_each(rdev, mddev) { - if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) { - nonrot = false; - break; - } - } - if (mddev->degraded) - nonrot = false; - if (nonrot) - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); - blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q); - - /* Set the NOWAIT flags if all underlying devices support it */ - if (nowait) - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); - } if (pers->sync_request) { if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_redundancy_group)) @@ -6437,7 +6469,7 @@ void md_stop_writes(struct mddev *mddev) { mddev_lock_nointr(mddev); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev, true, false); + stop_sync_thread(mddev, true); __md_stop_writes(mddev); mddev_unlock(mddev); } @@ -6505,7 +6537,7 @@ static int md_set_readonly(struct mddev *mddev) set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - stop_sync_thread(mddev, false, false); + stop_sync_thread(mddev, false); wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); mddev_lock_nointr(mddev); @@ -6551,7 +6583,7 @@ static int do_md_stop(struct mddev *mddev, int mode) set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - stop_sync_thread(mddev, true, false); + stop_sync_thread(mddev, true); if (mddev->sysfs_active || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { @@ -7166,15 +7198,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) if (!mddev->thread) md_update_sb(mddev, 1); /* - * If the new disk does not support REQ_NOWAIT, - * disable on the whole MD. - */ - if (!bdev_nowait(rdev->bdev)) { - pr_info("%s: Disabling nowait because %pg does not support nowait\n", - mdname(mddev), rdev->bdev); - blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue); - } - /* * Kick recovery, maybe this spare has to be added to the * array immediately. 
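md_init_stacking_limits() centralizes the feature bits every md array advertises (write cache, FUA, IO stats, NOWAIT), replacing the per-queue flag fiddling that the md_run() hunk below deletes. A hypothetical personality would set up its queue in the same shape as raid0_set_limits() later in this diff (demo_set_limits is not in the patch):

/* hypothetical personality, mirroring the new raid0/raid1 pattern */
static int demo_set_limits(struct mddev *mddev)
{
        struct queue_limits lim;
        int err;

        md_init_stacking_limits(&lim);
        lim.io_min = mddev->chunk_sectors << 9;

        /* stacks rdev queue limits and integrity profiles in one pass */
        err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
        if (err) {
                queue_limits_cancel_update(mddev->gendisk->queue);
                return err;
        }
        return queue_limits_set(mddev->gendisk->queue, &lim);
}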
*/ @@ -7742,12 +7765,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, return get_bitmap_file(mddev, argp); } - if (cmd == HOT_REMOVE_DISK) - /* need to ensure recovery thread has run */ - wait_event_interruptible_timeout(mddev->sb_wait, - !test_bit(MD_RECOVERY_NEEDED, - &mddev->recovery), - msecs_to_jiffies(5000)); if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { /* Need to flush page cache, and ensure no-one else opens * and writes @@ -8520,7 +8537,7 @@ int unregister_md_personality(struct md_personality *p) } EXPORT_SYMBOL(unregister_md_personality); -int register_md_cluster_operations(struct md_cluster_operations *ops, +int register_md_cluster_operations(const struct md_cluster_operations *ops, struct module *module) { int ret = 0; @@ -8641,12 +8658,12 @@ EXPORT_SYMBOL(md_done_sync); * A return value of 'false' means that the write wasn't recorded * and cannot proceed as the array is being suspend. */ -bool md_write_start(struct mddev *mddev, struct bio *bi) +void md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; if (bio_data_dir(bi) != WRITE) - return true; + return; BUG_ON(mddev->ro == MD_RDONLY); if (mddev->ro == MD_AUTO_READ) { @@ -8679,15 +8696,9 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); if (!mddev->has_superblocks) - return true; + return; wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || - is_md_suspended(mddev)); - if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { - percpu_ref_put(&mddev->writes_pending); - return false; - } - return true; + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } EXPORT_SYMBOL(md_write_start); @@ -8835,6 +8846,77 @@ void md_allow_write(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_allow_write); +static sector_t md_sync_max_sectors(struct mddev *mddev, + enum sync_action action) +{ + switch (action) { + case ACTION_RESYNC: + case ACTION_CHECK: + case ACTION_REPAIR: + atomic64_set(&mddev->resync_mismatches, 0); + fallthrough; + case ACTION_RESHAPE: + return mddev->resync_max_sectors; + case ACTION_RECOVER: + return mddev->dev_sectors; + default: + return 0; + } +} + +static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) +{ + sector_t start = 0; + struct md_rdev *rdev; + + switch (action) { + case ACTION_CHECK: + case ACTION_REPAIR: + return mddev->resync_min; + case ACTION_RESYNC: + if (!mddev->bitmap) + return mddev->recovery_cp; + return 0; + case ACTION_RESHAPE: + /* + * If the original node aborts reshaping then we continue the + * reshaping, so set again to avoid restart reshape from the + * first beginning + */ + if (mddev_is_clustered(mddev) && + mddev->reshape_position != MaxSector) + return mddev->reshape_position; + return 0; + case ACTION_RECOVER: + start = MaxSector; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < start) + start = rdev->recovery_offset; + rcu_read_unlock(); + + /* If there is a bitmap, we need to make sure all + * writes that started before we added a spare + * complete before we start doing a recovery. + * Otherwise the write might complete and (via + * bitmap_endwrite) set a bit in the bitmap after the + * recovery has checked that bit and skipped that + * region. 
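md_write_start() can no longer fail: rather than bailing out when a superblock update is pending, it simply waits for MD_SB_CHANGE_PENDING to clear, and suspension is left to the suspend machinery. Callers therefore lose their error branch; a hypothetical write path now reduces to:

/* write path of a hypothetical personality after this change */
static bool demo_make_request(struct mddev *mddev, struct bio *bio)
{
        /* may block until the superblock write completes; never fails */
        md_write_start(mddev, bio);

        /* ... map and submit the bio ... */

        md_write_end(mddev);
        return true;
}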
+ */ + if (mddev->bitmap) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + return start; + default: + return MaxSector; + } +} + #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) #define UPDATE_FREQUENCY (5*60*HZ) @@ -8851,7 +8933,8 @@ void md_do_sync(struct md_thread *thread) sector_t last_check; int skipped = 0; struct md_rdev *rdev; - char *desc, *action = NULL; + enum sync_action action; + const char *desc; struct blk_plug plug; int ret; @@ -8882,21 +8965,9 @@ void md_do_sync(struct md_thread *thread) goto skip; } - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { - desc = "data-check"; - action = "check"; - } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - desc = "requested-resync"; - action = "repair"; - } else - desc = "resync"; - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - desc = "reshape"; - else - desc = "recovery"; - - mddev->last_sync_action = action ?: desc; + action = md_sync_action(mddev); + desc = md_sync_action_name(action); + mddev->last_sync_action = action; /* * Before starting a resync we must have set curr_resync to @@ -8907,7 +8978,8 @@ void md_do_sync(struct md_thread *thread) * This will mean we have to start checking from the beginning again. * */ - + if (mddev_is_clustered(mddev)) + md_cluster_ops->resync_start_notify(mddev); do { int mddev2_minor = -1; mddev->curr_resync = MD_RESYNC_DELAYED; @@ -8964,56 +9036,8 @@ void md_do_sync(struct md_thread *thread) spin_unlock(&all_mddevs_lock); } while (mddev->curr_resync < MD_RESYNC_DELAYED); - j = 0; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - /* resync follows the size requested by the personality, - * which defaults to physical size, but can be virtual size - */ - max_sectors = mddev->resync_max_sectors; - atomic64_set(&mddev->resync_mismatches, 0); - /* we don't use the checkpoint if there's a bitmap */ - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - j = mddev->resync_min; - else if (!mddev->bitmap) - j = mddev->recovery_cp; - - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { - max_sectors = mddev->resync_max_sectors; - /* - * If the original node aborts reshaping then we continue the - * reshaping, so set j again to avoid restart reshape from the - * first beginning - */ - if (mddev_is_clustered(mddev) && - mddev->reshape_position != MaxSector) - j = mddev->reshape_position; - } else { - /* recovery follows the physical size of devices */ - max_sectors = mddev->dev_sectors; - j = MaxSector; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < j) - j = rdev->recovery_offset; - rcu_read_unlock(); - - /* If there is a bitmap, we need to make sure all - * writes that started before we added a spare - * complete before we start doing a recovery. - * Otherwise the write might complete and (via - * bitmap_endwrite) set a bit in the bitmap after the - * recovery has checked that bit and skipped that - * region. 
- */ - if (mddev->bitmap) { - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } - } + max_sectors = md_sync_max_sectors(mddev, action); + j = md_sync_position(mddev, action); pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); @@ -9095,7 +9119,8 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; - sectors = mddev->pers->sync_request(mddev, j, &skipped); + sectors = mddev->pers->sync_request(mddev, j, max_sectors, + &skipped); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); break; @@ -9185,7 +9210,7 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify_dirent_safe(mddev->sysfs_completed); } - mddev->pers->sync_request(mddev, max_sectors, &skipped); + mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > MD_RESYNC_ACTIVE) { @@ -9968,8 +9993,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) */ if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && !(le32_to_cpu(sb->feature_map) & - MD_FEATURE_RESHAPE_ACTIVE)) { - rdev2->saved_raid_disk = role; + MD_FEATURE_RESHAPE_ACTIVE) && + !md_cluster_ops->resync_status_get(mddev)) { + /* + * -1 to make raid1_add_disk() set conf->fullsync + * to 1. This could avoid skipping sync when the + * remote node is down during resyncing. + */ + if ((le32_to_cpu(sb->feature_map) + & MD_FEATURE_RECOVERY_OFFSET)) + rdev2->saved_raid_disk = -1; + else + rdev2->saved_raid_disk = role; ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %pg\n", rdev2->bdev); diff --git a/drivers/md/md.h b/drivers/md/md.h index ca085ecad504..a0d6827dced9 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -34,6 +34,61 @@ */ #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) +/* Status of sync thread. */ +enum sync_action { + /* + * Represent by MD_RECOVERY_SYNC, start when: + * 1) after assemble, sync data from first rdev to other copies, this + * must be done first before other sync actions and will only execute + * once; + * 2) resize the array (notice that this is not reshape), sync data for + * the new range; + */ + ACTION_RESYNC, + /* + * Represent by MD_RECOVERY_RECOVER, start when: + * 1) for new replacement, sync data based on the replace rdev or + * available copies from other rdev; + * 2) for new member disk while the array is degraded, sync data from + * other rdev; + * 3) reassemble after power failure or re-add a hot removed rdev, sync + * data from first rdev to other copies based on bitmap; + */ + ACTION_RECOVER, + /* + * Represent by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED | + * MD_RECOVERY_CHECK, start when user echo "check" to sysfs api + * sync_action, used to check if data copies from different rdev are + * the same.
The number of mismatch sectors will be exported to user + * by sysfs api mismatch_cnt; + */ + ACTION_CHECK, + /* + * Represent by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED, start when + * user echo "repair" to sysfs api sync_action, usually paired with + * ACTION_CHECK, used to force syncing data once user found that there + * are inconsistent data, + */ + ACTION_REPAIR, + /* + * Represent by MD_RECOVERY_RESHAPE, start when new member disk is added + * to the conf, notice that this is different from spares or + * replacement; + */ + ACTION_RESHAPE, + /* + * Represent by MD_RECOVERY_FROZEN, can be set by sysfs api sync_action + * or internal usage like setting the array read-only, will forbid above + * actions. + */ + ACTION_FROZEN, + /* + * All above actions don't match. + */ + ACTION_IDLE, + NR_SYNC_ACTIONS, +}; + /* * The struct embedded in rdev is used to serialize IO. */ @@ -371,13 +426,12 @@ struct mddev { struct md_thread __rcu *thread; /* management thread */ struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */ - /* 'last_sync_action' is initialized to "none". It is set when a - * sync operation (i.e "data-check", "requested-resync", "resync", - * "recovery", or "reshape") is started. It holds this value even + /* + * Set when a sync operation is started. It holds this value even * when the sync thread is "frozen" (interrupted) or "idle" (stopped - * or finished). It is overwritten when a new sync operation is begun. + * or finished). It is overwritten when a new sync operation is begun. */ - char *last_sync_action; + enum sync_action last_sync_action; sector_t curr_resync; /* last block scheduled */ /* As resync requests can complete out of order, we cannot easily track * how much resync has been completed. So we occasionally pause until @@ -540,8 +594,6 @@ struct mddev { */ struct list_head deleting; - /* Used to synchronize idle and frozen for action_store() */ - struct mutex sync_mutex; /* The sequence number for sync thread */ atomic_t sync_seq; @@ -551,22 +603,46 @@ struct mddev { }; enum recovery_flags { + /* flags for sync thread running status */ + + /* + * set when one of sync action is set and new sync thread need to be + * registered, or just add/remove spares from conf. + */ + MD_RECOVERY_NEEDED, + /* sync thread is running, or about to be started */ + MD_RECOVERY_RUNNING, + /* sync thread needs to be aborted for some reason */ + MD_RECOVERY_INTR, + /* sync thread is done and is waiting to be unregistered */ + MD_RECOVERY_DONE, + /* running sync thread must abort immediately, and not restart */ + MD_RECOVERY_FROZEN, + /* waiting for pers->start() to finish */ + MD_RECOVERY_WAIT, + /* interrupted because io-error */ + MD_RECOVERY_ERROR, + + /* flags determines sync action, see details in enum sync_action */ + + /* if just this flag is set, action is resync. */ + MD_RECOVERY_SYNC, + /* + * paired with MD_RECOVERY_SYNC, if MD_RECOVERY_CHECK is not set, + * action is repair, means user requested resync. + */ + MD_RECOVERY_REQUESTED, /* - * If neither SYNC or RESHAPE are set, then it is a recovery. + * paired with MD_RECOVERY_SYNC and MD_RECOVERY_REQUESTED, action is + * check. */ - MD_RECOVERY_RUNNING, /* a thread is running, or about to be started */ - MD_RECOVERY_SYNC, /* actually doing a resync, not a recovery */ - MD_RECOVERY_RECOVER, /* doing recovery, or need to try it. 
*/ - MD_RECOVERY_INTR, /* resync needs to be aborted for some reason */ - MD_RECOVERY_DONE, /* thread is done and is waiting to be reaped */ - MD_RECOVERY_NEEDED, /* we might need to start a resync/recover */ - MD_RECOVERY_REQUESTED, /* user-space has requested a sync (used with SYNC) */ - MD_RECOVERY_CHECK, /* user-space request for check-only, no repair */ - MD_RECOVERY_RESHAPE, /* A reshape is happening */ - MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */ - MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */ - MD_RECOVERY_WAIT, /* waiting for pers->start() to finish */ - MD_RESYNCING_REMOTE, /* remote node is running resync thread */ + MD_RECOVERY_CHECK, + /* recovery, or need to try it */ + MD_RECOVERY_RECOVER, + /* reshape */ + MD_RECOVERY_RESHAPE, + /* remote node is running resync thread */ + MD_RESYNCING_REMOTE, }; enum md_ro_state { @@ -653,7 +729,8 @@ struct md_personality int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*spare_active) (struct mddev *mddev); - sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped); + sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, + sector_t max_sector, int *skipped); int (*resize) (struct mddev *mddev, sector_t sectors); sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); int (*check_reshape) (struct mddev *mddev); @@ -772,7 +849,7 @@ static inline void safe_put_page(struct page *p) extern int register_md_personality(struct md_personality *p); extern int unregister_md_personality(struct md_personality *p); -extern int register_md_cluster_operations(struct md_cluster_operations *ops, +extern int register_md_cluster_operations(const struct md_cluster_operations *ops, struct module *module); extern int unregister_md_cluster_operations(void); extern int md_setup_cluster(struct mddev *mddev, int nodes); @@ -785,7 +862,10 @@ extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **t extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); -extern bool md_write_start(struct mddev *mddev, struct bio *bi); +extern enum sync_action md_sync_action(struct mddev *mddev); +extern enum sync_action md_sync_action_by_name(const char *page); +extern const char *md_sync_action_name(enum sync_action action); +extern void md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); @@ -809,11 +889,11 @@ extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); extern int md_integrity_register(struct mddev *mddev); -extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern int mddev_init(struct mddev *mddev); extern void mddev_destroy(struct mddev *mddev); +void md_init_stacking_limits(struct queue_limits *lim); struct mddev *md_alloc(dev_t dev, char *name); void mddev_put(struct mddev *mddev); extern int md_run(struct mddev *mddev); @@ -852,7 +932,7 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct 
mddev *mddev) } } -extern struct md_cluster_operations *md_cluster_ops; +extern const struct md_cluster_operations *md_cluster_ops; static inline int mddev_is_clustered(struct mddev *mddev) { return mddev->cluster_info && mddev->bitmap_info.nodes > 1; @@ -908,7 +988,9 @@ void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); int do_md_run(struct mddev *mddev); -void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim); +#define MDDEV_STACK_INTEGRITY (1u << 0) +int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, + unsigned int flags); int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev); void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes); diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 798c9c53a343..157c9bd2fed7 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -38,7 +38,7 @@ struct array_block { */ #define CSUM_XOR 595846735 -static void array_block_prepare_for_write(struct dm_block_validator *v, +static void array_block_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t size_of_block) { @@ -50,7 +50,7 @@ static void array_block_prepare_for_write(struct dm_block_validator *v, CSUM_XOR)); } -static int array_block_check(struct dm_block_validator *v, +static int array_block_check(const struct dm_block_validator *v, struct dm_block *b, size_t size_of_block) { @@ -77,7 +77,7 @@ static int array_block_check(struct dm_block_validator *v, return 0; } -static struct dm_block_validator array_validator = { +static const struct dm_block_validator array_validator = { .name = "array", .prepare_for_write = array_block_prepare_for_write, .check = array_block_check diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index b17b54df673b..1ef71e5fcde7 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -345,7 +345,7 @@ void *dm_block_data(struct dm_block *b) EXPORT_SYMBOL_GPL(dm_block_data); struct buffer_aux { - struct dm_block_validator *validator; + const struct dm_block_validator *validator; int write_locked; #ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING @@ -441,7 +441,7 @@ dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm) static int dm_bm_validate_buffer(struct dm_block_manager *bm, struct dm_buffer *buf, struct buffer_aux *aux, - struct dm_block_validator *v) + const struct dm_block_validator *v) { if (unlikely(!aux->validator)) { int r; @@ -467,7 +467,7 @@ static int dm_bm_validate_buffer(struct dm_block_manager *bm, return 0; } int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result) { struct buffer_aux *aux; @@ -500,7 +500,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, EXPORT_SYMBOL_GPL(dm_bm_read_lock); int dm_bm_write_lock(struct dm_block_manager *bm, - dm_block_t b, struct dm_block_validator *v, + dm_block_t b, const struct dm_block_validator *v, struct dm_block **result) { struct buffer_aux *aux; @@ -536,7 +536,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm, EXPORT_SYMBOL_GPL(dm_bm_write_lock); int dm_bm_read_try_lock(struct dm_block_manager *bm, - dm_block_t b, struct dm_block_validator *v, + dm_block_t b, const struct 
dm_block_validator *v, struct dm_block **result) { struct buffer_aux *aux; @@ -569,7 +569,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm, } int dm_bm_write_lock_zero(struct dm_block_manager *bm, - dm_block_t b, struct dm_block_validator *v, + dm_block_t b, const struct dm_block_validator *v, struct dm_block **result) { int r; diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index f706d3de8d5a..b1998968594c 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h @@ -51,12 +51,14 @@ dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm); */ struct dm_block_validator { const char *name; - void (*prepare_for_write)(struct dm_block_validator *v, struct dm_block *b, size_t block_size); + void (*prepare_for_write)(const struct dm_block_validator *v, + struct dm_block *b, size_t block_size); /* * Return 0 if the checksum is valid or < 0 on error. */ - int (*check)(struct dm_block_validator *v, struct dm_block *b, size_t block_size); + int (*check)(const struct dm_block_validator *v, + struct dm_block *b, size_t block_size); }; /*----------------------------------------------------------------*/ @@ -73,11 +75,11 @@ struct dm_block_validator { * written back to the disk sometime after dm_bm_unlock is called. */ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result); int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result); /* @@ -85,7 +87,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b, * available immediately. */ int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result); /* @@ -93,7 +95,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b, * overwrite the block completely. It saves a disk read. */ int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result); void dm_bm_unlock(struct dm_block *b); diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index 7ed2ce656fcc..acebd32858a7 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -138,7 +138,7 @@ static inline uint64_t value64(struct btree_node *n, uint32_t index) */ int lower_bound(struct btree_node *n, uint64_t key); -extern struct dm_block_validator btree_node_validator; +extern const struct dm_block_validator btree_node_validator; /* * Value type for upper levels of multi-level btrees. 
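Since struct dm_block_validator is now only ever read, every instance can be const and live in rodata. A hypothetical block type would define its validator along the same lines as array_validator and index_validator elsewhere in this diff (demo_disk_header and DEMO_CSUM_XOR are made up for illustration):

struct demo_disk_header {
        __le32 csum;
        __le32 flags;
        __le64 blocknr;
};

#define DEMO_CSUM_XOR 48273 /* arbitrary per-block-type seed */

static void demo_prepare_for_write(const struct dm_block_validator *v,
                                   struct dm_block *b, size_t block_size)
{
        struct demo_disk_header *h = dm_block_data(b);

        h->blocknr = cpu_to_le64(dm_block_location(b));
        h->csum = cpu_to_le32(dm_bm_checksum(&h->flags,
                                             block_size - sizeof(__le32),
                                             DEMO_CSUM_XOR));
}

static int demo_check(const struct dm_block_validator *v,
                      struct dm_block *b, size_t block_size)
{
        struct demo_disk_header *h = dm_block_data(b);
        __le32 csum;

        if (dm_block_location(b) != le64_to_cpu(h->blocknr))
                return -ENOTBLK;

        csum = cpu_to_le32(dm_bm_checksum(&h->flags,
                                          block_size - sizeof(__le32),
                                          DEMO_CSUM_XOR));
        if (csum != h->csum)
                return -EILSEQ;

        return 0;
}

static const struct dm_block_validator demo_validator = {
        .name = "demo",
        .prepare_for_write = demo_prepare_for_write,
        .check = demo_check,
};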
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index 7540383b7cf3..c46fc50c274e 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -16,7 +16,7 @@ #define BTREE_CSUM_XOR 121107 -static void node_prepare_for_write(struct dm_block_validator *v, +static void node_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -29,7 +29,7 @@ static void node_prepare_for_write(struct dm_block_validator *v, BTREE_CSUM_XOR)); } -static int node_check(struct dm_block_validator *v, +static int node_check(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -81,7 +81,7 @@ static int node_check(struct dm_block_validator *v, return 0; } -struct dm_block_validator btree_node_validator = { +const struct dm_block_validator btree_node_validator = { .name = "btree_node", .prepare_for_write = node_prepare_for_write, .check = node_check diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 591d1a43d035..3a19124ee279 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -22,7 +22,7 @@ */ #define INDEX_CSUM_XOR 160478 -static void index_prepare_for_write(struct dm_block_validator *v, +static void index_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -34,7 +34,7 @@ static void index_prepare_for_write(struct dm_block_validator *v, INDEX_CSUM_XOR)); } -static int index_check(struct dm_block_validator *v, +static int index_check(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -59,7 +59,7 @@ static int index_check(struct dm_block_validator *v, return 0; } -static struct dm_block_validator index_validator = { +static const struct dm_block_validator index_validator = { .name = "index", .prepare_for_write = index_prepare_for_write, .check = index_check @@ -72,7 +72,7 @@ static struct dm_block_validator index_validator = { */ #define BITMAP_CSUM_XOR 240779 -static void dm_bitmap_prepare_for_write(struct dm_block_validator *v, +static void dm_bitmap_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -84,7 +84,7 @@ static void dm_bitmap_prepare_for_write(struct dm_block_validator *v, BITMAP_CSUM_XOR)); } -static int dm_bitmap_check(struct dm_block_validator *v, +static int dm_bitmap_check(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -109,7 +109,7 @@ static int dm_bitmap_check(struct dm_block_validator *v, return 0; } -static struct dm_block_validator dm_sm_bitmap_validator = { +static const struct dm_block_validator dm_sm_bitmap_validator = { .name = "sm_bitmap", .prepare_for_write = dm_bitmap_prepare_for_write, .check = dm_bitmap_check, diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index c88fa6266203..c7ba4e6cbbc7 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -237,7 +237,7 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root) EXPORT_SYMBOL_GPL(dm_tm_commit); int dm_tm_new_block(struct dm_transaction_manager *tm, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result) { int r; @@ -266,7 +266,7 @@ int dm_tm_new_block(struct dm_transaction_manager 
*tm, } static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result) { int r; @@ -306,7 +306,7 @@ static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, } int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, - struct dm_block_validator *v, struct dm_block **result, + const struct dm_block_validator *v, struct dm_block **result, int *inc_children) { int r; @@ -331,7 +331,7 @@ int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, EXPORT_SYMBOL_GPL(dm_tm_shadow_block); int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **blk) { if (tm->is_clone) { diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h index 01f7e650118d..61a8d10825ca 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.h +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -64,7 +64,7 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock) * Zeroes the new block and returns with write lock held. */ int dm_tm_new_block(struct dm_transaction_manager *tm, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result); /* @@ -84,7 +84,7 @@ int dm_tm_new_block(struct dm_transaction_manager *tm, * it locked when you call this. */ int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result, int *inc_children); /* @@ -92,7 +92,7 @@ int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, * on it outstanding then it'll block. 
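On the consumer side nothing changes except that a const validator can be passed straight through. A read-side sketch using the hypothetical demo_validator from above:

/* illustrative read path; tm and demo_validator as sketched earlier */
static int demo_inspect(struct dm_transaction_manager *tm, dm_block_t b)
{
        struct dm_block *blk;
        int r;

        r = dm_tm_read_lock(tm, b, &demo_validator, &blk);
        if (r)
                return r;

        /* ... examine dm_block_data(blk) ... */

        dm_tm_unlock(tm, blk);
        return 0;
}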
*/ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result); void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c5d4aeb68404..32d587524778 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -365,30 +365,30 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks return array_sectors; } -static void free_conf(struct mddev *mddev, struct r0conf *conf) -{ - kfree(conf->strip_zone); - kfree(conf->devlist); - kfree(conf); -} - static void raid0_free(struct mddev *mddev, void *priv) { struct r0conf *conf = priv; - free_conf(mddev, conf); + kfree(conf->strip_zone); + kfree(conf->devlist); + kfree(conf); } static int raid0_set_limits(struct mddev *mddev) { struct queue_limits lim; + int err; - blk_set_stacking_limits(&lim); + md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; - mddev_stack_rdev_limits(mddev, &lim); + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) { + queue_limits_cancel_update(mddev->gendisk->queue); + return err; + } return queue_limits_set(mddev->gendisk->queue, &lim); } @@ -415,7 +415,7 @@ static int raid0_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid0_set_limits(mddev); if (ret) - goto out_free_conf; + return ret; } /* calculate array device size */ @@ -427,13 +427,7 @@ static int raid0_run(struct mddev *mddev) dump_zones(mddev); - ret = md_integrity_register(mddev); - if (ret) - goto out_free_conf; - return 0; -out_free_conf: - free_conf(mddev, conf); - return ret; + return md_integrity_register(mddev); } /* diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7b8a71ca66dd..7acfe7c9dc8d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -680,6 +680,7 @@ static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio, len = r1_bio->sectors; read_len = raid1_check_read_range(rdev, this_sector, &len); if (read_len == r1_bio->sectors) { + *max_sectors = read_len; update_read_sectors(conf, disk, this_sector, read_len); return disk; } @@ -1687,8 +1688,7 @@ static bool raid1_make_request(struct mddev *mddev, struct bio *bio) if (bio_data_dir(bio) == READ) raid1_read_request(mddev, bio, sectors, NULL); else { - if (!md_write_start(mddev,bio)) - return false; + md_write_start(mddev,bio); raid1_write_request(mddev, bio, sectors); } return true; @@ -1907,9 +1907,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; - if (md_integrity_add_rdev(rdev, mddev)) - return -ENXIO; - if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -2757,12 +2754,12 @@ static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) */ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, - int *skipped) + sector_t max_sector, int *skipped) { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; struct bio *bio; - sector_t max_sector, nr_sectors; + sector_t nr_sectors; int disk = -1; int i; int wonly = -1; @@ -2778,7 +2775,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (init_resync(conf)) return 0; - max_sector = mddev->dev_sectors; if (sector_nr >= max_sector) { /* If we aborted, we need to abort 
the * sync on the 'current' bitmap chunk (there will @@ -3197,14 +3193,18 @@ static struct r1conf *setup_conf(struct mddev *mddev) static int raid1_set_limits(struct mddev *mddev) { struct queue_limits lim; + int err; - blk_set_stacking_limits(&lim); + md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; - mddev_stack_rdev_limits(mddev, &lim); + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) { + queue_limits_cancel_update(mddev->gendisk->queue); + return err; + } return queue_limits_set(mddev->gendisk->queue, &lim); } -static void raid1_free(struct mddev *mddev, void *priv); static int raid1_run(struct mddev *mddev) { struct r1conf *conf; @@ -3238,7 +3238,7 @@ static int raid1_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid1_set_limits(mddev); if (ret) - goto abort; + return ret; } mddev->degraded = 0; @@ -3252,8 +3252,7 @@ static int raid1_run(struct mddev *mddev) */ if (conf->raid_disks - mddev->degraded < 1) { md_unregister_thread(mddev, &conf->thread); - ret = -EINVAL; - goto abort; + return -EINVAL; } if (conf->raid_disks - mddev->degraded == 1) @@ -3277,14 +3276,8 @@ static int raid1_run(struct mddev *mddev) md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); ret = md_integrity_register(mddev); - if (ret) { + if (ret) md_unregister_thread(mddev, &mddev->thread); - goto abort; - } - return 0; - -abort: - raid1_free(mddev, conf); return ret; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a4556d2e46bf..2a9c4ee982e0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1836,8 +1836,7 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) && md_flush_request(mddev, bio)) return true; - if (!md_write_start(mddev, bio)) - return false; + md_write_start(mddev, bio); if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) if (!raid10_handle_discard(mddev, bio)) @@ -2083,9 +2082,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) return -EINVAL; - if (md_integrity_add_rdev(rdev, mddev)) - return -ENXIO; - if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -3140,12 +3136,12 @@ static void raid10_set_cluster_sync_high(struct r10conf *conf) */ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, - int *skipped) + sector_t max_sector, int *skipped) { struct r10conf *conf = mddev->private; struct r10bio *r10_bio; struct bio *biolist = NULL, *bio; - sector_t max_sector, nr_sectors; + sector_t nr_sectors; int i; int max_sync; sector_t sync_blocks; @@ -3175,10 +3171,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, return 0; skipped: - max_sector = mddev->dev_sectors; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || - test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { conf->cluster_sync_low = 0; conf->cluster_sync_high = 0; @@ -3980,12 +3972,17 @@ static int raid10_set_queue_limits(struct mddev *mddev) { struct r10conf *conf = mddev->private; struct queue_limits lim; + int err; - blk_set_stacking_limits(&lim); + md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); - mddev_stack_rdev_limits(mddev, &lim); + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) { + queue_limits_cancel_update(mddev->gendisk->queue); + return err; + } return 
queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2bd1ce9b3922..c14cf2410365 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -155,7 +155,7 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh, return slot; } -static void print_raid5_conf (struct r5conf *conf); +static void print_raid5_conf(struct r5conf *conf); static int stripe_operations_active(struct stripe_head *sh) { @@ -5899,6 +5899,39 @@ out: return ret; } +enum reshape_loc { + LOC_NO_RESHAPE, + LOC_AHEAD_OF_RESHAPE, + LOC_INSIDE_RESHAPE, + LOC_BEHIND_RESHAPE, +}; + +static enum reshape_loc get_reshape_loc(struct mddev *mddev, + struct r5conf *conf, sector_t logical_sector) +{ + sector_t reshape_progress, reshape_safe; + /* + * Spinlock is needed as reshape_progress may be + * 64bit on a 32bit platform, and so it might be + * possible to see a half-updated value + * Of course reshape_progress could change after + * the lock is dropped, so once we get a reference + * to the stripe that we think it is, we will have + * to check again. + */ + spin_lock_irq(&conf->device_lock); + reshape_progress = conf->reshape_progress; + reshape_safe = conf->reshape_safe; + spin_unlock_irq(&conf->device_lock); + if (reshape_progress == MaxSector) + return LOC_NO_RESHAPE; + if (ahead_of_reshape(mddev, logical_sector, reshape_progress)) + return LOC_AHEAD_OF_RESHAPE; + if (ahead_of_reshape(mddev, logical_sector, reshape_safe)) + return LOC_INSIDE_RESHAPE; + return LOC_BEHIND_RESHAPE; +} + static enum stripe_result make_stripe_request(struct mddev *mddev, struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t logical_sector, struct bio *bi) @@ -5913,28 +5946,14 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, seq = read_seqcount_begin(&conf->gen_lock); if (unlikely(conf->reshape_progress != MaxSector)) { - /* - * Spinlock is needed as reshape_progress may be - * 64bit on a 32bit platform, and so it might be - * possible to see a half-updated value - * Of course reshape_progress could change after - * the lock is dropped, so once we get a reference - * to the stripe that we think it is, we will have - * to check again. 
- */ - spin_lock_irq(&conf->device_lock); - if (ahead_of_reshape(mddev, logical_sector, - conf->reshape_progress)) { - previous = 1; - } else { - if (ahead_of_reshape(mddev, logical_sector, - conf->reshape_safe)) { - spin_unlock_irq(&conf->device_lock); - ret = STRIPE_SCHEDULE_AND_RETRY; - goto out; - } + enum reshape_loc loc = get_reshape_loc(mddev, conf, + logical_sector); + if (loc == LOC_INSIDE_RESHAPE) { + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out; } - spin_unlock_irq(&conf->device_lock); + if (loc == LOC_AHEAD_OF_RESHAPE) + previous = 1; } new_sector = raid5_compute_sector(conf, logical_sector, previous, @@ -6078,8 +6097,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) ctx.do_flush = bi->bi_opf & REQ_PREFLUSH; } - if (!md_write_start(mddev, bi)) - return false; + md_write_start(mddev, bi); /* * If array is degraded, better not do chunk aligned read because * later we might have to read it again in order to reconstruct @@ -6113,8 +6131,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && (conf->reshape_progress != MaxSector) && - !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) && - ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { + get_reshape_loc(mddev, conf, logical_sector) == LOC_INSIDE_RESHAPE) { bio_wouldblock_error(bi); if (rw == WRITE) md_write_end(mddev); @@ -6255,7 +6272,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->reshape_backwards) { - BUG_ON(writepos < reshape_sectors); + if (WARN_ON(writepos < reshape_sectors)) + return MaxSector; + writepos -= reshape_sectors; readpos += reshape_sectors; safepos += reshape_sectors; @@ -6273,14 +6292,18 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk * to set 'stripe_addr' which is where we will write to. 
*/ if (mddev->reshape_backwards) { - BUG_ON(conf->reshape_progress == 0); + if (WARN_ON(conf->reshape_progress == 0)) + return MaxSector; + stripe_addr = writepos; - BUG_ON((mddev->dev_sectors & - ~((sector_t)reshape_sectors - 1)) - - reshape_sectors - stripe_addr - != sector_nr); + if (WARN_ON((mddev->dev_sectors & + ~((sector_t)reshape_sectors - 1)) - + reshape_sectors - stripe_addr != sector_nr)) + return MaxSector; } else { - BUG_ON(writepos != sector_nr + reshape_sectors); + if (WARN_ON(writepos != sector_nr + reshape_sectors)) + return MaxSector; + stripe_addr = sector_nr; } @@ -6458,11 +6481,10 @@ ret: } static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, - int *skipped) + sector_t max_sector, int *skipped) { struct r5conf *conf = mddev->private; struct stripe_head *sh; - sector_t max_sector = mddev->dev_sectors; sector_t sync_blocks; int still_degraded = 0; int i; @@ -7082,12 +7104,14 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) err = -ENODEV; else if (new != conf->skip_copy) { struct request_queue *q = mddev->gendisk->queue; + struct queue_limits lim = queue_limits_start_update(q); conf->skip_copy = new; if (new) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); + lim.features |= BLK_FEAT_STABLE_WRITES; else - blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); + lim.features &= ~BLK_FEAT_STABLE_WRITES; + err = queue_limits_commit_update(q, &lim); } mddev_unlock_and_resume(mddev); return err ?: len; @@ -7562,11 +7586,11 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (test_bit(Replacement, &rdev->flags)) { if (disk->replacement) goto abort; - RCU_INIT_POINTER(disk->replacement, rdev); + disk->replacement = rdev; } else { if (disk->rdev) goto abort; - RCU_INIT_POINTER(disk->rdev, rdev); + disk->rdev = rdev; } if (test_bit(In_sync, &rdev->flags)) { @@ -7702,13 +7726,13 @@ static int raid5_set_limits(struct mddev *mddev) */ stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9)); - blk_set_stacking_limits(&lim); + md_init_stacking_limits(&lim); lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); - lim.raid_partial_stripes_expensive = 1; + lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; - mddev_stack_rdev_limits(mddev, &lim); + mddev_stack_rdev_limits(mddev, &lim, 0); rdev_for_each(rdev, mddev) queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, mddev->gendisk->disk_name); @@ -8048,7 +8072,7 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev) seq_printf (seq, "]"); } -static void print_raid5_conf (struct r5conf *conf) +static void print_raid5_conf(struct r5conf *conf) { struct md_rdev *rdev; int i; @@ -8062,15 +8086,13 @@ static void print_raid5_conf (struct r5conf *conf) conf->raid_disks, conf->raid_disks - conf->mddev->degraded); - rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - rdev = rcu_dereference(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; if (rdev) pr_debug(" disk %d, o:%d, dev:%pg\n", i, !test_bit(Faulty, &rdev->flags), rdev->bdev); } - rcu_read_unlock(); } static int raid5_spare_active(struct mddev *mddev) |
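The skip_copy hunk above shows the pattern this series uses throughout: queue features now live in queue_limits.features and are flipped under a start/commit pair instead of with blk_queue_flag_set(), so concurrent readers always see a consistent snapshot of the limits. Distilled (q and enable stand in for the surrounding context):

struct queue_limits lim = queue_limits_start_update(q);

if (enable)
        lim.features |= BLK_FEAT_STABLE_WRITES;
else
        lim.features &= ~BLK_FEAT_STABLE_WRITES;
err = queue_limits_commit_update(q, &lim);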