Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig | 11
-rw-r--r--  drivers/md/Makefile | 1
-rw-r--r--  drivers/md/bcache/Kconfig | 2
-rw-r--r--  drivers/md/bcache/alloc.c | 5
-rw-r--r--  drivers/md/bcache/bcache.h | 18
-rw-r--r--  drivers/md/bcache/bset.c | 13
-rw-r--r--  drivers/md/bcache/bset.h | 2
-rw-r--r--  drivers/md/bcache/btree.c | 6
-rw-r--r--  drivers/md/bcache/debug.c | 7
-rw-r--r--  drivers/md/bcache/extents.c | 2
-rw-r--r--  drivers/md/bcache/io.c | 12
-rw-r--r--  drivers/md/bcache/request.c | 23
-rw-r--r--  drivers/md/bcache/super.c | 195
-rw-r--r--  drivers/md/bcache/sysfs.c | 54
-rw-r--r--  drivers/md/bcache/util.c | 35
-rw-r--r--  drivers/md/bcache/util.h | 5
-rw-r--r--  drivers/md/bcache/writeback.c | 4
-rw-r--r--  drivers/md/dm-bio-prison-v1.c | 15
-rw-r--r--  drivers/md/dm-bio-prison-v2.c | 15
-rw-r--r--  drivers/md/dm-bufio.c | 5
-rw-r--r--  drivers/md/dm-cache-background-tracker.c | 2
-rw-r--r--  drivers/md/dm-cache-policy-smq.c | 4
-rw-r--r--  drivers/md/dm-cache-target.c | 82
-rw-r--r--  drivers/md/dm-core.h | 38
-rw-r--r--  drivers/md/dm-crypt.c | 82
-rw-r--r--  drivers/md/dm-integrity.c | 35
-rw-r--r--  drivers/md/dm-io.c | 31
-rw-r--r--  drivers/md/dm-ioctl.c | 3
-rw-r--r--  drivers/md/dm-kcopyd.c | 27
-rw-r--r--  drivers/md/dm-linear.c | 16
-rw-r--r--  drivers/md/dm-log-userspace-base.c | 19
-rw-r--r--  drivers/md/dm-log-writes.c | 15
-rw-r--r--  drivers/md/dm-mpath.c | 3
-rw-r--r--  drivers/md/dm-raid.c | 4
-rw-r--r--  drivers/md/dm-raid1.c | 10
-rw-r--r--  drivers/md/dm-region-hash.c | 38
-rw-r--r--  drivers/md/dm-rq.c | 4
-rw-r--r--  drivers/md/dm-snap.c | 28
-rw-r--r--  drivers/md/dm-stats.c | 4
-rw-r--r--  drivers/md/dm-stripe.c | 21
-rw-r--r--  drivers/md/dm-switch.c | 3
-rw-r--r--  drivers/md/dm-table.c | 19
-rw-r--r--  drivers/md/dm-thin-metadata.c | 9
-rw-r--r--  drivers/md/dm-thin.c | 52
-rw-r--r--  drivers/md/dm-verity-fec.c | 57
-rw-r--r--  drivers/md/dm-verity-fec.h | 8
-rw-r--r--  drivers/md/dm-verity-target.c | 5
-rw-r--r--  drivers/md/dm-writecache.c | 2305
-rw-r--r--  drivers/md/dm-zoned-target.c | 17
-rw-r--r--  drivers/md/dm.c | 111
-rw-r--r--  drivers/md/md-bitmap.c | 6
-rw-r--r--  drivers/md/md-cluster.c | 6
-rw-r--r--  drivers/md/md-faulty.c | 2
-rw-r--r--  drivers/md/md-linear.c | 2
-rw-r--r--  drivers/md/md-multipath.c | 20
-rw-r--r--  drivers/md/md-multipath.h | 2
-rw-r--r--  drivers/md/md.c | 241
-rw-r--r--  drivers/md/md.h | 26
-rw-r--r--  drivers/md/raid0.c | 15
-rw-r--r--  drivers/md/raid1.c | 93
-rw-r--r--  drivers/md/raid1.h | 6
-rw-r--r--  drivers/md/raid10.c | 85
-rw-r--r--  drivers/md/raid10.h | 6
-rw-r--r--  drivers/md/raid5-cache.c | 43
-rw-r--r--  drivers/md/raid5-ppl.c | 42
-rw-r--r--  drivers/md/raid5.c | 39
-rw-r--r--  drivers/md/raid5.h | 3
67 files changed, 3312 insertions(+), 807 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index edff083f7c4e..8b8c123cae66 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -334,6 +334,17 @@ config DM_CACHE_SMQ
of less memory utilization, improved performance and increased
adaptability in the face of changing workloads.
+config DM_WRITECACHE
+ tristate "Writecache target"
+ depends on BLK_DEV_DM
+ ---help---
+ The writecache target caches writes on persistent memory or SSD.
+ It is intended for databases or other programs that need extremely
+ low commit latency.
+
+ The writecache target doesn't cache reads because reads are supposed
+ to be cached in standard RAM.
+
config DM_ERA
tristate "Era target (EXPERIMENTAL)"
depends on BLK_DEV_DM
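For orientation, the new target is assembled from the constructor parameters
dm-writecache accepts: cache type ("p" for persistent memory, "s" for SSD),
the origin device, the cache device, a block size, and the number of optional
arguments. A minimal usage sketch, not taken from this patch; the device
paths are placeholders:

    # cache writes to /dev/sdb on SSD /dev/sdc, 4096-byte blocks,
    # no optional arguments
    dmsetup create wc --table "0 $(blockdev --getsz /dev/sdb) \
        writecache s /dev/sdb /dev/sdc 4096 0"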
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 63255f3ebd97..822f4e8753bc 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -67,6 +67,7 @@ obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
obj-$(CONFIG_DM_ZONED) += dm-zoned.o
+obj-$(CONFIG_DM_WRITECACHE) += dm-writecache.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 4d200883c505..17bf109c58e9 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -5,7 +5,7 @@ config BCACHE
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
- See Documentation/bcache.txt for details.
+ See Documentation/admin-guide/bcache.rst for details.
config BCACHE_DEBUG
bool "Bcache debugging"
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 004cc3cc6123..7fa2631b422c 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -290,7 +290,7 @@ do { \
if (kthread_should_stop() || \
test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) { \
set_current_state(TASK_RUNNING); \
- return 0; \
+ goto out; \
} \
\
schedule(); \
@@ -378,6 +378,9 @@ retry_invalidate:
bch_prio_write(ca);
}
}
+out:
+ wait_for_kthread_stop();
+ return 0;
}
/* Allocation */
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d338b7086013..d6bf294f3907 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -269,7 +269,7 @@ struct bcache_device {
atomic_t *stripe_sectors_dirty;
unsigned long *full_dirty_stripes;
- struct bio_set *bio_split;
+ struct bio_set bio_split;
unsigned data_csum:1;
@@ -345,6 +345,7 @@ struct cached_dev {
struct keybuf writeback_keys;
+ struct task_struct *status_update_thread;
/*
* Order the write-half of writeback operations strongly in dispatch
* order. (Maintain LBA order; don't allow reads completing out of
@@ -392,6 +393,9 @@ struct cached_dev {
#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64
atomic_t io_errors;
unsigned error_limit;
+ unsigned offline_seconds;
+
+ char backing_dev_name[BDEVNAME_SIZE];
};
enum alloc_reserve {
@@ -464,6 +468,8 @@ struct cache {
atomic_long_t meta_sectors_written;
atomic_long_t btree_sectors_written;
atomic_long_t sectors_written;
+
+ char cache_dev_name[BDEVNAME_SIZE];
};
struct gc_stat {
@@ -524,9 +530,9 @@ struct cache_set {
struct closure sb_write;
struct semaphore sb_write_mutex;
- mempool_t *search;
- mempool_t *bio_meta;
- struct bio_set *bio_split;
+ mempool_t search;
+ mempool_t bio_meta;
+ struct bio_set bio_split;
/* For the btree cache */
struct shrinker shrink;
@@ -651,7 +657,7 @@ struct cache_set {
* A btree node on disk could have too many bsets for an iterator to fit
* on the stack - have to dynamically allocate them
*/
- mempool_t *fill_iter;
+ mempool_t fill_iter;
struct bset_sort_state sort;
@@ -952,8 +958,6 @@ void bch_prio_write(struct cache *);
void bch_write_bdev_super(struct cached_dev *, struct closure *);
extern struct workqueue_struct *bcache_wq;
-extern const char * const bch_cache_modes[];
-extern const char * const bch_stop_on_failure_modes[];
extern struct mutex bch_register_lock;
extern struct list_head bch_cache_sets;
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 579c696a5fe0..f3403b45bc28 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -1118,8 +1118,7 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
void bch_bset_sort_state_free(struct bset_sort_state *state)
{
- if (state->pool)
- mempool_destroy(state->pool);
+ mempool_exit(&state->pool);
}
int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
@@ -1129,11 +1128,7 @@ int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
state->page_order = page_order;
state->crit_factor = int_sqrt(1 << page_order);
- state->pool = mempool_create_page_pool(1, page_order);
- if (!state->pool)
- return -ENOMEM;
-
- return 0;
+ return mempool_init_page_pool(&state->pool, 1, page_order);
}
EXPORT_SYMBOL(bch_bset_sort_state_init);
@@ -1191,7 +1186,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
BUG_ON(order > state->page_order);
- outp = mempool_alloc(state->pool, GFP_NOIO);
+ outp = mempool_alloc(&state->pool, GFP_NOIO);
out = page_address(outp);
used_mempool = true;
order = state->page_order;
@@ -1220,7 +1215,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
}
if (used_mempool)
- mempool_free(virt_to_page(out), state->pool);
+ mempool_free(virt_to_page(out), &state->pool);
else
free_pages((unsigned long) out, order);
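The bset.c hunks above show the conversion pattern repeated across this
series: a mempool_t is embedded in its owning structure and managed with
mempool_init_*()/mempool_exit() rather than allocated via mempool_create_*()
and released with mempool_destroy(). The init helpers return 0 or a negative
errno, and mempool_exit() is safe on a zeroed, never-initialized pool, which
is why several create paths below also switch kmalloc() to kzalloc(). A
minimal sketch of the shape, mirroring bch_bset_sort_state_init():

    struct sort_state {
            mempool_t pool;         /* embedded, not a pointer */
            unsigned page_order;
    };

    static int sort_state_init(struct sort_state *s, unsigned page_order)
    {
            s->page_order = page_order;
            /* 0 on success, -ENOMEM on failure; no pointer to NULL-check */
            return mempool_init_page_pool(&s->pool, 1, page_order);
    }

    static void sort_state_exit(struct sort_state *s)
    {
            /* safe even if init was never called on a zeroed struct */
            mempool_exit(&s->pool);
    }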
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 0c24280f3b98..b867f2200495 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -347,7 +347,7 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b,
/* Sorting */
struct bset_sort_state {
- mempool_t *pool;
+ mempool_t pool;
unsigned page_order;
unsigned crit_factor;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 17936b2dc7d6..547c9eedc2f4 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -18,7 +18,7 @@
* as keys are inserted we only sort the pages that have not yet been written.
* When garbage collection is run, we resort the entire node.
*
- * All configuration is done via sysfs; see Documentation/bcache.txt.
+ * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst.
*/
#include "bcache.h"
@@ -204,7 +204,7 @@ void bch_btree_node_read_done(struct btree *b)
struct bset *i = btree_bset_first(b);
struct btree_iter *iter;
- iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
+ iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
iter->used = 0;
@@ -271,7 +271,7 @@ void bch_btree_node_read_done(struct btree *b)
bch_bset_init_next(&b->keys, write_block(b),
bset_magic(&b->c->sb));
out:
- mempool_free(iter, b->c->fill_iter);
+ mempool_free(iter, &b->c->fill_iter);
return;
err:
set_btree_node_io_error(b);
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 028f7b386e01..d030ce3025a6 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -106,7 +106,6 @@ void bch_btree_verify(struct btree *b)
void bch_data_verify(struct cached_dev *dc, struct bio *bio)
{
- char name[BDEVNAME_SIZE];
struct bio *check;
struct bio_vec bv, cbv;
struct bvec_iter iter, citer = { 0 };
@@ -134,7 +133,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
bv.bv_len),
dc->disk.c,
"verify failed at dev %s sector %llu",
- bdevname(dc->bdev, name),
+ dc->backing_dev_name,
(uint64_t) bio->bi_iter.bi_sector);
kunmap_atomic(p1);
@@ -251,7 +250,9 @@ void bch_debug_exit(void)
int __init bch_debug_init(struct kobject *kobj)
{
- bcache_debug = debugfs_create_dir("bcache", NULL);
+ if (!IS_ENABLED(CONFIG_DEBUG_FS))
+ return 0;
+ bcache_debug = debugfs_create_dir("bcache", NULL);
return IS_ERR_OR_NULL(bcache_debug);
}
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index c334e6666461..1d096742eb41 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -18,7 +18,7 @@
* as keys are inserted we only sort the pages that have not yet been written.
* When garbage collection is run, we resort the entire node.
*
- * All configuration is done via sysfs; see Documentation/bcache.txt.
+ * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst.
*/
#include "bcache.h"
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 7fac97ae036e..9612873afee2 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -17,12 +17,12 @@
void bch_bbio_free(struct bio *bio, struct cache_set *c)
{
struct bbio *b = container_of(bio, struct bbio, bio);
- mempool_free(b, c->bio_meta);
+ mempool_free(b, &c->bio_meta);
}
struct bio *bch_bbio_alloc(struct cache_set *c)
{
- struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
+ struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
struct bio *bio = &b->bio;
bio_init(bio, bio->bi_inline_vecs, bucket_pages(c));
@@ -52,7 +52,6 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
/* IO errors */
void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio)
{
- char buf[BDEVNAME_SIZE];
unsigned errors;
WARN_ONCE(!dc, "NULL pointer of struct cached_dev");
@@ -60,7 +59,7 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio)
errors = atomic_add_return(1, &dc->io_errors);
if (errors < dc->error_limit)
pr_err("%s: IO error on backing device, unrecoverable",
- bio_devname(bio, buf));
+ dc->backing_dev_name);
else
bch_cached_dev_error(dc);
}
@@ -105,19 +104,18 @@ void bch_count_io_errors(struct cache *ca,
}
if (error) {
- char buf[BDEVNAME_SIZE];
unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT,
&ca->io_errors);
errors >>= IO_ERROR_SHIFT;
if (errors < ca->set->error_limit)
pr_err("%s: IO error on %s%s",
- bdevname(ca->bdev, buf), m,
+ ca->cache_dev_name, m,
is_read ? ", recovering." : ".");
else
bch_cache_set_error(ca->set,
"%s: too many IO errors %s",
- bdevname(ca->bdev, buf), m);
+ ca->cache_dev_name, m);
}
}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index a65e3365eeb9..ae67f5fa8047 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -213,7 +213,7 @@ static void bch_data_insert_start(struct closure *cl)
do {
unsigned i;
struct bkey *k;
- struct bio_set *split = op->c->bio_split;
+ struct bio_set *split = &op->c->bio_split;
/* 1 for the device pointer and 1 for the chksum */
if (bch_keylist_realloc(&op->insert_keys,
@@ -548,7 +548,7 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
KEY_OFFSET(k) - bio->bi_iter.bi_sector),
- GFP_NOIO, s->d->bio_split);
+ GFP_NOIO, &s->d->bio_split);
bio_key = &container_of(n, struct bbio, bio)->key;
bch_bkey_copy_single_ptr(bio_key, k, ptr);
@@ -649,11 +649,8 @@ static void backing_request_endio(struct bio *bio)
*/
if (unlikely(s->iop.writeback &&
bio->bi_opf & REQ_PREFLUSH)) {
- char buf[BDEVNAME_SIZE];
-
- bio_devname(bio, buf);
pr_err("Can't flush %s: returned bi_status %i",
- buf, bio->bi_status);
+ dc->backing_dev_name, bio->bi_status);
} else {
/* set to orig_bio->bi_status in bio_complete() */
s->iop.status = bio->bi_status;
@@ -710,7 +707,7 @@ static void search_free(struct closure *cl)
bio_complete(s);
closure_debug_destroy(cl);
- mempool_free(s, s->d->c->search);
+ mempool_free(s, &s->d->c->search);
}
static inline struct search *search_alloc(struct bio *bio,
@@ -718,7 +715,7 @@ static inline struct search *search_alloc(struct bio *bio,
{
struct search *s;
- s = mempool_alloc(d->c->search, GFP_NOIO);
+ s = mempool_alloc(&d->c->search, GFP_NOIO);
closure_init(&s->cl, NULL);
do_bio_hook(s, bio, request_endio);
@@ -867,7 +864,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
s->cache_missed = 1;
if (s->cache_miss || s->iop.bypass) {
- miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+ miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
goto out_submit;
}
@@ -890,14 +887,14 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
s->iop.replace = true;
- miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+ miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
/* btree_search_recurse()'s btree iterator is no good anymore */
ret = miss == bio ? MAP_DONE : -EINTR;
cache_bio = bio_alloc_bioset(GFP_NOWAIT,
DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
- dc->disk.bio_split);
+ &dc->disk.bio_split);
if (!cache_bio)
goto out_submit;
@@ -1011,7 +1008,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
struct bio *flush;
flush = bio_alloc_bioset(GFP_NOIO, 0,
- dc->disk.bio_split);
+ &dc->disk.bio_split);
if (!flush) {
s->iop.status = BLK_STS_RESOURCE;
goto insert_data;
@@ -1024,7 +1021,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
closure_bio_submit(s->iop.c, flush, cl);
}
} else {
- s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
+ s->iop.bio = bio_clone_fast(bio, GFP_NOIO, &dc->disk.bio_split);
/* I/O request sent to backing device */
bio->bi_end_io = backing_request_endio;
closure_bio_submit(s->iop.c, bio, cl);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index d90d9e59ca00..fa4058e43202 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -37,24 +37,6 @@ static const char invalid_uuid[] = {
0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
};
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
-const char * const bch_cache_modes[] = {
- "default",
- "writethrough",
- "writeback",
- "writearound",
- "none",
- NULL
-};
-
-/* Default is -1; we skip past it for stop_when_cache_set_failed */
-const char * const bch_stop_on_failure_modes[] = {
- "default",
- "auto",
- "always",
- NULL
-};
-
static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
LIST_HEAD(bch_cache_sets);
@@ -654,6 +636,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct bcache_device *d = b->bd_disk->private_data;
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+
+ if (dc->io_disable)
+ return -EIO;
+
return d->ioctl(d, mode, cmd, arg);
}
@@ -766,8 +753,7 @@ static void bcache_device_free(struct bcache_device *d)
put_disk(d->disk);
}
- if (d->bio_split)
- bioset_free(d->bio_split);
+ bioset_exit(&d->bio_split);
kvfree(d->full_dirty_stripes);
kvfree(d->stripe_sectors_dirty);
@@ -809,9 +795,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
if (idx < 0)
return idx;
- if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
- BIOSET_NEED_BVECS |
- BIOSET_NEED_RESCUER)) ||
+ if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
+ BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
!(d->disk = alloc_disk(BCACHE_MINORS))) {
ida_simple_remove(&bcache_device_idx, idx);
return -ENOMEM;
@@ -864,6 +849,44 @@ static void calc_cached_dev_sectors(struct cache_set *c)
c->cached_dev_sectors = sectors;
}
+#define BACKING_DEV_OFFLINE_TIMEOUT 5
+static int cached_dev_status_update(void *arg)
+{
+ struct cached_dev *dc = arg;
+ struct request_queue *q;
+
+ /*
+ * If this delayed worker is stopping outside, directly quit here.
+ * dc->io_disable might be set via sysfs interface, so check it
+ * here too.
+ */
+ while (!kthread_should_stop() && !dc->io_disable) {
+ q = bdev_get_queue(dc->bdev);
+ if (blk_queue_dying(q))
+ dc->offline_seconds++;
+ else
+ dc->offline_seconds = 0;
+
+ if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
+ pr_err("%s: device offline for %d seconds",
+ dc->backing_dev_name,
+ BACKING_DEV_OFFLINE_TIMEOUT);
+ pr_err("%s: disable I/O request due to backing "
+ "device offline", dc->disk.name);
+ dc->io_disable = true;
+ /* let others know earlier that io_disable is true */
+ smp_mb();
+ bcache_device_stop(&dc->disk);
+ break;
+ }
+ schedule_timeout_interruptible(HZ);
+ }
+
+ wait_for_kthread_stop();
+ return 0;
+}
+
+
void bch_cached_dev_run(struct cached_dev *dc)
{
struct bcache_device *d = &dc->disk;
@@ -906,6 +929,14 @@ void bch_cached_dev_run(struct cached_dev *dc)
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
pr_debug("error creating sysfs link");
+
+ dc->status_update_thread = kthread_run(cached_dev_status_update,
+ dc, "bcache_status_update");
+ if (IS_ERR(dc->status_update_thread)) {
+ pr_warn("failed to create bcache_status_update kthread, "
+ "continue to run without monitoring backing "
+ "device status");
+ }
}
/*
@@ -936,7 +967,6 @@ static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
static void cached_dev_detach_finish(struct work_struct *w)
{
struct cached_dev *dc = container_of(w, struct cached_dev, detach);
- char buf[BDEVNAME_SIZE];
struct closure cl;
closure_init_stack(&cl);
@@ -967,7 +997,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
mutex_unlock(&bch_register_lock);
- pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));
+ pr_info("Caching disabled for %s", dc->backing_dev_name);
/* Drop ref we took in cached_dev_detach() */
closure_put(&dc->disk.cl);
@@ -999,29 +1029,28 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
{
uint32_t rtime = cpu_to_le32(get_seconds());
struct uuid_entry *u;
- char buf[BDEVNAME_SIZE];
struct cached_dev *exist_dc, *t;
- bdevname(dc->bdev, buf);
-
if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
(!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
return -ENOENT;
if (dc->disk.c) {
- pr_err("Can't attach %s: already attached", buf);
+ pr_err("Can't attach %s: already attached",
+ dc->backing_dev_name);
return -EINVAL;
}
if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
- pr_err("Can't attach %s: shutting down", buf);
+ pr_err("Can't attach %s: shutting down",
+ dc->backing_dev_name);
return -EINVAL;
}
if (dc->sb.block_size < c->sb.block_size) {
/* Will die */
pr_err("Couldn't attach %s: block size less than set's block size",
- buf);
+ dc->backing_dev_name);
return -EINVAL;
}
@@ -1029,7 +1058,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
pr_err("Tried to attach %s but duplicate UUID already attached",
- buf);
+ dc->backing_dev_name);
return -EINVAL;
}
@@ -1047,13 +1076,15 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
if (!u) {
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
- pr_err("Couldn't find uuid for %s in set", buf);
+ pr_err("Couldn't find uuid for %s in set",
+ dc->backing_dev_name);
return -ENOENT;
}
u = uuid_find_empty(c);
if (!u) {
- pr_err("Not caching %s, no room for UUID", buf);
+ pr_err("Not caching %s, no room for UUID",
+ dc->backing_dev_name);
return -EINVAL;
}
}
@@ -1112,7 +1143,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
up_write(&dc->writeback_lock);
pr_info("Caching %s as %s on set %pU",
- bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
+ dc->backing_dev_name,
+ dc->disk.disk->disk_name,
dc->disk.c->sb.set_uuid);
return 0;
}
@@ -1138,6 +1170,8 @@ static void cached_dev_free(struct closure *cl)
kthread_stop(dc->writeback_thread);
if (dc->writeback_write_wq)
destroy_workqueue(dc->writeback_write_wq);
+ if (!IS_ERR_OR_NULL(dc->status_update_thread))
+ kthread_stop(dc->status_update_thread);
if (atomic_read(&dc->running))
bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
@@ -1225,10 +1259,10 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
struct block_device *bdev,
struct cached_dev *dc)
{
- char name[BDEVNAME_SIZE];
const char *err = "cannot allocate memory";
struct cache_set *c;
+ bdevname(bdev, dc->backing_dev_name);
memcpy(&dc->sb, sb, sizeof(struct cache_sb));
dc->bdev = bdev;
dc->bdev->bd_holder = dc;
@@ -1237,6 +1271,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
get_page(sb_page);
+
if (cached_dev_init(dc, sb->block_size << 9))
goto err;
@@ -1247,7 +1282,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
goto err;
- pr_info("registered backing device %s", bdevname(bdev, name));
+ pr_info("registered backing device %s", dc->backing_dev_name);
list_add(&dc->list, &uncached_devices);
list_for_each_entry(c, &bch_cache_sets, list)
@@ -1259,7 +1294,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
return;
err:
- pr_notice("error %s: %s", bdevname(bdev, name), err);
+ pr_notice("error %s: %s", dc->backing_dev_name, err);
bcache_device_stop(&dc->disk);
}
@@ -1367,7 +1402,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
bool bch_cached_dev_error(struct cached_dev *dc)
{
- char name[BDEVNAME_SIZE];
+ struct cache_set *c;
if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
return false;
@@ -1377,7 +1412,22 @@ bool bch_cached_dev_error(struct cached_dev *dc)
smp_mb();
pr_err("stop %s: too many IO errors on backing device %s\n",
- dc->disk.disk->disk_name, bdevname(dc->bdev, name));
+ dc->disk.disk->disk_name, dc->backing_dev_name);
+
+ /*
+ * If the cached device is still attached to a cache set,
+ * even dc->io_disable is true and no more I/O requests
+ * accepted, cache device internal I/O (writeback scan or
+ * garbage collection) may still prevent bcache device from
+ * being stopped. So here CACHE_SET_IO_DISABLE should be
+ * set to c->flags too, to make the internal I/O to cache
+ * device rejected and stopped immediately.
+ * If c is NULL, that means the bcache device is not attached
+ * to any cache set, then no CACHE_SET_IO_DISABLE bit to set.
+ */
+ c = dc->disk.c;
+ if (c && test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
+ pr_info("CACHE_SET_IO_DISABLE already set");
bcache_device_stop(&dc->disk);
return true;
@@ -1395,7 +1445,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
return false;
if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
- pr_warn("CACHE_SET_IO_DISABLE already set");
+ pr_info("CACHE_SET_IO_DISABLE already set");
/* XXX: we can be called from atomic context
acquire_console_sem();
@@ -1448,14 +1498,10 @@ static void cache_set_free(struct closure *cl)
if (c->moving_gc_wq)
destroy_workqueue(c->moving_gc_wq);
- if (c->bio_split)
- bioset_free(c->bio_split);
- if (c->fill_iter)
- mempool_destroy(c->fill_iter);
- if (c->bio_meta)
- mempool_destroy(c->bio_meta);
- if (c->search)
- mempool_destroy(c->search);
+ bioset_exit(&c->bio_split);
+ mempool_exit(&c->fill_iter);
+ mempool_exit(&c->bio_meta);
+ mempool_exit(&c->search);
kfree(c->devices);
mutex_lock(&bch_register_lock);
@@ -1539,6 +1585,20 @@ static void conditional_stop_bcache_device(struct cache_set *c,
*/
pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
d->disk->disk_name);
+ /*
+ * There might be a small time gap that cache set is
+ * released but bcache device is not. Inside this time
+ * gap, regular I/O requests will directly go into
+ * backing device as no cache set attached to. This
+ * behavior may also introduce potentially inconsistent
+ * data in writeback mode while cache is dirty.
+ * Therefore before calling bcache_device_stop() due
+ * to a broken cache device, dc->io_disable should be
+ * explicitly set to true.
+ */
+ dc->io_disable = true;
+ /* make others know io_disable is true earlier */
+ smp_mb();
bcache_device_stop(d);
} else {
/*
@@ -1652,21 +1712,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
INIT_LIST_HEAD(&c->btree_cache_freed);
INIT_LIST_HEAD(&c->data_buckets);
- c->search = mempool_create_slab_pool(32, bch_search_cache);
- if (!c->search)
- goto err;
-
iter_size = (sb->bucket_size / sb->block_size + 1) *
sizeof(struct btree_iter_set);
- if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
- !(c->bio_meta = mempool_create_kmalloc_pool(2,
- sizeof(struct bbio) + sizeof(struct bio_vec) *
- bucket_pages(c))) ||
- !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
- !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio),
- BIOSET_NEED_BVECS |
- BIOSET_NEED_RESCUER)) ||
+ if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) ||
+ mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
+ mempool_init_kmalloc_pool(&c->bio_meta, 2,
+ sizeof(struct bbio) + sizeof(struct bio_vec) *
+ bucket_pages(c)) ||
+ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
+ bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
+ BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
!(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
!(c->moving_gc_wq = alloc_workqueue("bcache_gc",
WQ_MEM_RECLAIM, 0)) ||
@@ -1985,10 +2041,11 @@ static int cache_alloc(struct cache *ca)
!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
!init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
- !(ca->buckets = vzalloc(sizeof(struct bucket) *
- ca->sb.nbuckets)) ||
- !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
- 2, GFP_KERNEL)) ||
+ !(ca->buckets = vzalloc(array_size(sizeof(struct bucket),
+ ca->sb.nbuckets))) ||
+ !(ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
+ prio_buckets(ca), 2),
+ GFP_KERNEL)) ||
!(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)))
return -ENOMEM;
@@ -2003,12 +2060,10 @@ static int cache_alloc(struct cache *ca)
static int register_cache(struct cache_sb *sb, struct page *sb_page,
struct block_device *bdev, struct cache *ca)
{
- char name[BDEVNAME_SIZE];
const char *err = NULL; /* must be set for any error case */
int ret = 0;
- bdevname(bdev, name);
-
+ bdevname(bdev, ca->cache_dev_name);
memcpy(&ca->sb, sb, sizeof(struct cache_sb));
ca->bdev = bdev;
ca->bdev->bd_holder = ca;
@@ -2045,14 +2100,14 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
goto out;
}
- pr_info("registered cache device %s", name);
+ pr_info("registered cache device %s", ca->cache_dev_name);
out:
kobject_put(&ca->kobj);
err:
if (err)
- pr_notice("error %s: %s", name, err);
+ pr_notice("error %s: %s", ca->cache_dev_name, err);
return ret;
}
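The same embedding applies to bio sets throughout super.c:
bioset_create()/bioset_free() become bioset_init()/bioset_exit() on a
struct bio_set member, and every allocation site passes the member's
address. A condensed sketch of the calling convention, with the sizes and
flags copied from bcache_device_init() above:

    /* embedded in struct bcache_device / struct cache_set */
    struct bio_set bio_split;

    if (bioset_init(&bio_split, 4, offsetof(struct bbio, bio),
                    BIOSET_NEED_BVECS | BIOSET_NEED_RESCUER))
            return -ENOMEM;              /* returns 0 or a negative errno */

    bio = bio_alloc_bioset(GFP_NOIO, nr_vecs, &bio_split);

    bioset_exit(&bio_split);             /* replaces bioset_free() */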
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index dfeef583ee50..225b15aa0340 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -16,6 +16,22 @@
#include <linux/sort.h>
#include <linux/sched/clock.h>
+/* Default is -1; we skip past it for struct cached_dev's cache mode */
+static const char * const bch_cache_modes[] = {
+ "writethrough",
+ "writeback",
+ "writearound",
+ "none",
+ NULL
+};
+
+/* Default is -1; we skip past it for stop_when_cache_set_failed */
+static const char * const bch_stop_on_failure_modes[] = {
+ "auto",
+ "always",
+ NULL
+};
+
static const char * const cache_replacement_policies[] = {
"lru",
"fifo",
@@ -114,6 +130,20 @@ rw_attribute(btree_shrinker_disabled);
rw_attribute(copy_gc_enabled);
rw_attribute(size);
+static ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
+ size_t selected)
+{
+ char *out = buf;
+ size_t i;
+
+ for (i = 0; list[i]; i++)
+ out += snprintf(out, buf + size - out,
+ i == selected ? "[%s] " : "%s ", list[i]);
+
+ out[-1] = '\n';
+ return out - buf;
+}
+
SHOW(__bch_cached_dev)
{
struct cached_dev *dc = container_of(kobj, struct cached_dev,
@@ -124,12 +154,12 @@ SHOW(__bch_cached_dev)
if (attr == &sysfs_cache_mode)
return bch_snprint_string_list(buf, PAGE_SIZE,
- bch_cache_modes + 1,
+ bch_cache_modes,
BDEV_CACHE_MODE(&dc->sb));
if (attr == &sysfs_stop_when_cache_set_failed)
return bch_snprint_string_list(buf, PAGE_SIZE,
- bch_stop_on_failure_modes + 1,
+ bch_stop_on_failure_modes,
dc->stop_when_cache_set_failed);
@@ -253,8 +283,7 @@ STORE(__cached_dev)
bch_cached_dev_run(dc);
if (attr == &sysfs_cache_mode) {
- v = bch_read_string_list(buf, bch_cache_modes + 1);
-
+ v = __sysfs_match_string(bch_cache_modes, -1, buf);
if (v < 0)
return v;
@@ -265,8 +294,7 @@ STORE(__cached_dev)
}
if (attr == &sysfs_stop_when_cache_set_failed) {
- v = bch_read_string_list(buf, bch_stop_on_failure_modes + 1);
-
+ v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf);
if (v < 0)
return v;
@@ -635,6 +663,7 @@ SHOW_LOCKED(bch_cache_set)
STORE(__bch_cache_set)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+ ssize_t v;
if (attr == &sysfs_unregister)
bch_cache_set_unregister(c);
@@ -698,8 +727,7 @@ STORE(__bch_cache_set)
c->congested_write_threshold_us);
if (attr == &sysfs_errors) {
- ssize_t v = bch_read_string_list(buf, error_actions);
-
+ v = __sysfs_match_string(error_actions, -1, buf);
if (v < 0)
return v;
@@ -714,8 +742,7 @@ STORE(__bch_cache_set)
c->error_decay = strtoul_or_return(buf) / 88;
if (attr == &sysfs_io_disable) {
- int v = strtoul_or_return(buf);
-
+ v = strtoul_or_return(buf);
if (v) {
if (test_and_set_bit(CACHE_SET_IO_DISABLE,
&c->flags))
@@ -854,7 +881,8 @@ SHOW(__bch_cache)
uint16_t q[31], *p, *cached;
ssize_t ret;
- cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t));
+ cached = p = vmalloc(array_size(sizeof(uint16_t),
+ ca->sb.nbuckets));
if (!p)
return -ENOMEM;
@@ -929,6 +957,7 @@ SHOW_LOCKED(bch_cache)
STORE(__bch_cache)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
+ ssize_t v;
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
@@ -943,8 +972,7 @@ STORE(__bch_cache)
}
if (attr == &sysfs_cache_replacement_policy) {
- ssize_t v = bch_read_string_list(buf, cache_replacement_policies);
-
+ v = __sysfs_match_string(cache_replacement_policies, -1, buf);
if (v < 0)
return v;
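The sysfs store paths above drop the local bch_read_string_list() helper in
favour of the generic __sysfs_match_string(). For a NULL-terminated array,
passing n = -1 makes it scan until the terminator; it returns the index of
the matched entry or -EINVAL. A minimal sketch using the mode list defined
at the top of this file:

    static const char * const modes[] = {
            "writethrough", "writeback", "writearound", "none", NULL
    };

    ssize_t v = __sysfs_match_string(modes, -1, buf);
    if (v < 0)
            return v;       /* -EINVAL: no entry matched */
    /* v now indexes the matched mode */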
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 74febd5230df..fc479b026d6d 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -120,41 +120,6 @@ ssize_t bch_hprint(char *buf, int64_t v)
return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]);
}
-ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
- size_t selected)
-{
- char *out = buf;
- size_t i;
-
- for (i = 0; list[i]; i++)
- out += snprintf(out, buf + size - out,
- i == selected ? "[%s] " : "%s ", list[i]);
-
- out[-1] = '\n';
- return out - buf;
-}
-
-ssize_t bch_read_string_list(const char *buf, const char * const list[])
-{
- size_t i;
- char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
- if (!d)
- return -ENOMEM;
-
- s = strim(d);
-
- for (i = 0; list[i]; i++)
- if (!strcmp(list[i], s))
- break;
-
- kfree(d);
-
- if (!list[i])
- return -EINVAL;
-
- return i;
-}
-
bool bch_is_zero(const char *p, size_t n)
{
size_t i;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 268024529edd..cced87f8eb27 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -365,11 +365,6 @@ ssize_t bch_hprint(char *buf, int64_t v);
bool bch_is_zero(const char *p, size_t n);
int bch_parse_uuid(const char *s, char *uuid);
-ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
- size_t selected);
-
-ssize_t bch_read_string_list(const char *buf, const char * const list[]);
-
struct time_stats {
spinlock_t lock;
/*
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 4a9547cdcdc5..ad45ebe1a74b 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -244,8 +244,10 @@ static void dirty_endio(struct bio *bio)
struct keybuf_key *w = bio->bi_private;
struct dirty_io *io = w->private;
- if (bio->bi_status)
+ if (bio->bi_status) {
SET_KEY_DIRTY(&w->key, false);
+ bch_count_backing_io_errors(io->dc, bio);
+ }
closure_put(&io->cl);
}
diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index 874841f0fc83..b5389890bbc3 100644
--- a/drivers/md/dm-bio-prison-v1.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -19,8 +19,8 @@
struct dm_bio_prison {
spinlock_t lock;
- mempool_t *cell_pool;
struct rb_root cells;
+ mempool_t cell_pool;
};
static struct kmem_cache *_cell_cache;
@@ -33,15 +33,16 @@ static struct kmem_cache *_cell_cache;
*/
struct dm_bio_prison *dm_bio_prison_create(void)
{
- struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
+ struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL);
+ int ret;
if (!prison)
return NULL;
spin_lock_init(&prison->lock);
- prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
- if (!prison->cell_pool) {
+ ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache);
+ if (ret) {
kfree(prison);
return NULL;
}
@@ -54,21 +55,21 @@ EXPORT_SYMBOL_GPL(dm_bio_prison_create);
void dm_bio_prison_destroy(struct dm_bio_prison *prison)
{
- mempool_destroy(prison->cell_pool);
+ mempool_exit(&prison->cell_pool);
kfree(prison);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_destroy);
struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, gfp_t gfp)
{
- return mempool_alloc(prison->cell_pool, gfp);
+ return mempool_alloc(&prison->cell_pool, gfp);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell);
void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell)
{
- mempool_free(cell, prison->cell_pool);
+ mempool_free(cell, &prison->cell_pool);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
index 8ce3a1a588cf..b092cdc8e1ae 100644
--- a/drivers/md/dm-bio-prison-v2.c
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -21,8 +21,8 @@ struct dm_bio_prison_v2 {
struct workqueue_struct *wq;
spinlock_t lock;
- mempool_t *cell_pool;
struct rb_root cells;
+ mempool_t cell_pool;
};
static struct kmem_cache *_cell_cache;
@@ -35,7 +35,8 @@ static struct kmem_cache *_cell_cache;
*/
struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq)
{
- struct dm_bio_prison_v2 *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
+ struct dm_bio_prison_v2 *prison = kzalloc(sizeof(*prison), GFP_KERNEL);
+ int ret;
if (!prison)
return NULL;
@@ -43,8 +44,8 @@ struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq)
prison->wq = wq;
spin_lock_init(&prison->lock);
- prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
- if (!prison->cell_pool) {
+ ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache);
+ if (ret) {
kfree(prison);
return NULL;
}
@@ -57,21 +58,21 @@ EXPORT_SYMBOL_GPL(dm_bio_prison_create_v2);
void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison)
{
- mempool_destroy(prison->cell_pool);
+ mempool_exit(&prison->cell_pool);
kfree(prison);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_destroy_v2);
struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, gfp_t gfp)
{
- return mempool_alloc(prison->cell_pool, gfp);
+ return mempool_alloc(&prison->cell_pool, gfp);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell_v2);
void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell)
{
- mempool_free(cell, prison->cell_pool);
+ mempool_free(cell, &prison->cell_pool);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell_v2);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 12aa9ca21d8c..dc385b70e4c3 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1681,8 +1681,9 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
if (block_size <= KMALLOC_MAX_SIZE &&
(block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
- snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", c->block_size);
- c->slab_cache = kmem_cache_create(slab_name, c->block_size, ARCH_KMALLOC_MINALIGN,
+ unsigned align = min(1U << __ffs(block_size), (unsigned)PAGE_SIZE);
+ snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", block_size);
+ c->slab_cache = kmem_cache_create(slab_name, block_size, align,
SLAB_RECLAIM_ACCOUNT, NULL);
if (!c->slab_cache) {
r = -ENOMEM;
diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c
index 1d0af0a21fc7..84814e819e4c 100644
--- a/drivers/md/dm-cache-background-tracker.c
+++ b/drivers/md/dm-cache-background-tracker.c
@@ -166,7 +166,7 @@ static bool max_work_reached(struct background_tracker *b)
atomic_read(&b->pending_demotes) >= b->max_work;
}
-struct bt_work *alloc_work(struct background_tracker *b)
+static struct bt_work *alloc_work(struct background_tracker *b)
{
if (max_work_reached(b))
return NULL;
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 4ab23d0075f6..1b5b9ad9e492 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -69,7 +69,7 @@ static int space_init(struct entry_space *es, unsigned nr_entries)
return 0;
}
- es->begin = vzalloc(sizeof(struct entry) * nr_entries);
+ es->begin = vzalloc(array_size(nr_entries, sizeof(struct entry)));
if (!es->begin)
return -ENOMEM;
@@ -588,7 +588,7 @@ static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr
nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
ht->hash_bits = __ffs(nr_buckets);
- ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
+ ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets)));
if (!ht->buckets)
return -ENOMEM;
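The allocation-size changes here (and the array_size()/array3_size() calls
in bcache's super.c and sysfs.c) replace open-coded multiplications with the
overflow-checked helpers from <linux/overflow.h>: on overflow the computed
size saturates to SIZE_MAX, so the allocator fails cleanly instead of
returning an undersized buffer. A short sketch of both variants used in
this patch:

    #include <linux/overflow.h>

    /* saturates to SIZE_MAX if the product overflows */
    ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets)));

    /* three-factor variant, as used for prio_buckets in super.c */
    ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
                                           prio_buckets(ca), 2),
                               GFP_KERNEL);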
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index da208638fba4..ce14a3d1f609 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -371,7 +371,13 @@ struct cache_stats {
struct cache {
struct dm_target *ti;
- struct dm_target_callbacks callbacks;
+ spinlock_t lock;
+
+ /*
+ * Fields for converting from sectors to blocks.
+ */
+ int sectors_per_block_shift;
+ sector_t sectors_per_block;
struct dm_cache_metadata *cmd;
@@ -402,13 +408,11 @@ struct cache {
dm_cblock_t cache_size;
/*
- * Fields for converting from sectors to blocks.
+ * Invalidation fields.
*/
- sector_t sectors_per_block;
- int sectors_per_block_shift;
+ spinlock_t invalidation_lock;
+ struct list_head invalidation_requests;
- spinlock_t lock;
- struct bio_list deferred_bios;
sector_t migration_threshold;
wait_queue_head_t migration_wait;
atomic_t nr_allocated_migrations;
@@ -419,13 +423,11 @@ struct cache {
*/
atomic_t nr_io_migrations;
+ struct bio_list deferred_bios;
+
struct rw_semaphore quiesce_lock;
- /*
- * cache_size entries, dirty if set
- */
- atomic_t nr_dirty;
- unsigned long *dirty_bitset;
+ struct dm_target_callbacks callbacks;
/*
* origin_blocks entries, discarded if set.
@@ -442,17 +444,27 @@ struct cache {
const char **ctr_args;
struct dm_kcopyd_client *copier;
- struct workqueue_struct *wq;
struct work_struct deferred_bio_worker;
struct work_struct migration_worker;
+ struct workqueue_struct *wq;
struct delayed_work waker;
struct dm_bio_prison_v2 *prison;
- struct bio_set *bs;
- mempool_t *migration_pool;
+ /*
+ * cache_size entries, dirty if set
+ */
+ unsigned long *dirty_bitset;
+ atomic_t nr_dirty;
- struct dm_cache_policy *policy;
unsigned policy_nr_args;
+ struct dm_cache_policy *policy;
+
+ /*
+ * Cache features such as write-through.
+ */
+ struct cache_features features;
+
+ struct cache_stats stats;
bool need_tick_bio:1;
bool sized:1;
@@ -461,25 +473,16 @@ struct cache {
bool loaded_mappings:1;
bool loaded_discards:1;
- /*
- * Cache features such as write-through.
- */
- struct cache_features features;
-
- struct cache_stats stats;
+ struct rw_semaphore background_work_lock;
- /*
- * Invalidation fields.
- */
- spinlock_t invalidation_lock;
- struct list_head invalidation_requests;
+ struct batcher committer;
+ struct work_struct commit_ws;
struct io_tracker tracker;
- struct work_struct commit_ws;
- struct batcher committer;
+ mempool_t migration_pool;
- struct rw_semaphore background_work_lock;
+ struct bio_set bs;
};
struct per_bio_data {
@@ -550,7 +553,7 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache)
{
struct dm_cache_migration *mg;
- mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
+ mg = mempool_alloc(&cache->migration_pool, GFP_NOWAIT);
if (!mg)
return NULL;
@@ -569,7 +572,7 @@ static void free_migration(struct dm_cache_migration *mg)
if (atomic_dec_and_test(&cache->nr_allocated_migrations))
wake_up(&cache->migration_wait);
- mempool_free(mg, cache->migration_pool);
+ mempool_free(mg, &cache->migration_pool);
}
/*----------------------------------------------------------------*/
@@ -924,7 +927,7 @@ static void issue_op(struct bio *bio, void *context)
static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
dm_oblock_t oblock, dm_cblock_t cblock)
{
- struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, cache->bs);
+ struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs);
BUG_ON(!origin_bio);
@@ -2011,7 +2014,7 @@ static void destroy(struct cache *cache)
{
unsigned i;
- mempool_destroy(cache->migration_pool);
+ mempool_exit(&cache->migration_pool);
if (cache->prison)
dm_bio_prison_destroy_v2(cache->prison);
@@ -2047,8 +2050,7 @@ static void destroy(struct cache *cache)
kfree(cache->ctr_args[i]);
kfree(cache->ctr_args);
- if (cache->bs)
- bioset_free(cache->bs);
+ bioset_exit(&cache->bs);
kfree(cache);
}
@@ -2498,8 +2500,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
cache->features = ca->features;
if (writethrough_mode(cache)) {
/* Create bioset for writethrough bios issued to origin */
- cache->bs = bioset_create(BIO_POOL_SIZE, 0, 0);
- if (!cache->bs)
+ r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0);
+ if (r)
goto bad;
}
@@ -2630,9 +2632,9 @@ static int cache_create(struct cache_args *ca, struct cache **result)
goto bad;
}
- cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
- migration_cache);
- if (!cache->migration_pool) {
+ r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE,
+ migration_cache);
+ if (r) {
*error = "Error creating cache's migration mempool";
goto bad;
}
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 3222e21cbbf8..7d480c930eaf 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -31,6 +31,9 @@ struct dm_kobject_holder {
struct mapped_device {
struct mutex suspend_lock;
+ struct mutex table_devices_lock;
+ struct list_head table_devices;
+
/*
* The current mapping (struct dm_table *).
* Use dm_get_live_table{_fast} or take suspend_lock for
@@ -38,17 +41,14 @@ struct mapped_device {
*/
void __rcu *map;
- struct list_head table_devices;
- struct mutex table_devices_lock;
-
unsigned long flags;
- struct request_queue *queue;
- int numa_node_id;
-
- enum dm_queue_mode type;
/* Protect queue and type against concurrent access. */
struct mutex type_lock;
+ enum dm_queue_mode type;
+
+ int numa_node_id;
+ struct request_queue *queue;
atomic_t holders;
atomic_t open_count;
@@ -56,21 +56,21 @@ struct mapped_device {
struct dm_target *immutable_target;
struct target_type *immutable_target_type;
+ char name[16];
struct gendisk *disk;
struct dax_device *dax_dev;
- char name[16];
-
- void *interface_ptr;
/*
* A list of ios that arrived while we were suspended.
*/
- atomic_t pending[2];
- wait_queue_head_t wait;
struct work_struct work;
+ wait_queue_head_t wait;
+ atomic_t pending[2];
spinlock_t deferred_lock;
struct bio_list deferred;
+ void *interface_ptr;
+
/*
* Event handling.
*/
@@ -84,15 +84,15 @@ struct mapped_device {
unsigned internal_suspend_count;
/*
- * Processing queue (flush)
+ * io objects are allocated from here.
*/
- struct workqueue_struct *wq;
+ struct bio_set io_bs;
+ struct bio_set bs;
/*
- * io objects are allocated from here.
+ * Processing queue (flush)
*/
- struct bio_set *io_bs;
- struct bio_set *bs;
+ struct workqueue_struct *wq;
/*
* freeze/thaw support require holding onto a super block
@@ -102,11 +102,11 @@ struct mapped_device {
/* forced geometry settings */
struct hd_geometry geometry;
- struct block_device *bdev;
-
/* kobject and completion */
struct dm_kobject_holder kobj_holder;
+ struct block_device *bdev;
+
/* zero-length flush that will be cloned and submitted to targets */
struct bio flush_bio;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 44ff473dab3e..b61b069c33af 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -139,25 +139,13 @@ struct crypt_config {
struct dm_dev *dev;
sector_t start;
- /*
- * pool for per bio private data, crypto requests,
- * encryption requeusts/buffer pages and integrity tags
- */
- mempool_t *req_pool;
- mempool_t *page_pool;
- mempool_t *tag_pool;
- unsigned tag_pool_max_sectors;
-
struct percpu_counter n_allocated_pages;
- struct bio_set *bs;
- struct mutex bio_alloc_lock;
-
struct workqueue_struct *io_queue;
struct workqueue_struct *crypt_queue;
- struct task_struct *write_thread;
wait_queue_head_t write_thread_wait;
+ struct task_struct *write_thread;
struct rb_root write_tree;
char *cipher;
@@ -213,6 +201,18 @@ struct crypt_config {
unsigned int integrity_iv_size;
unsigned int on_disk_tag_size;
+ /*
+ * pool for per bio private data, crypto requests,
+ * encryption requests/buffer pages and integrity tags
+ */
+ unsigned tag_pool_max_sectors;
+ mempool_t tag_pool;
+ mempool_t req_pool;
+ mempool_t page_pool;
+
+ struct bio_set bs;
+ struct mutex bio_alloc_lock;
+
u8 *authenc_key; /* space for keys in authenc() format (if used) */
u8 key[0];
};
@@ -1245,7 +1245,7 @@ static void crypt_alloc_req_skcipher(struct crypt_config *cc,
unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
if (!ctx->r.req)
- ctx->r.req = mempool_alloc(cc->req_pool, GFP_NOIO);
+ ctx->r.req = mempool_alloc(&cc->req_pool, GFP_NOIO);
skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]);
@@ -1262,7 +1262,7 @@ static void crypt_alloc_req_aead(struct crypt_config *cc,
struct convert_context *ctx)
{
if (!ctx->r.req_aead)
- ctx->r.req_aead = mempool_alloc(cc->req_pool, GFP_NOIO);
+ ctx->r.req_aead = mempool_alloc(&cc->req_pool, GFP_NOIO);
aead_request_set_tfm(ctx->r.req_aead, cc->cipher_tfm.tfms_aead[0]);
@@ -1290,7 +1290,7 @@ static void crypt_free_req_skcipher(struct crypt_config *cc,
struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
if ((struct skcipher_request *)(io + 1) != req)
- mempool_free(req, cc->req_pool);
+ mempool_free(req, &cc->req_pool);
}
static void crypt_free_req_aead(struct crypt_config *cc,
@@ -1299,7 +1299,7 @@ static void crypt_free_req_aead(struct crypt_config *cc,
struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
if ((struct aead_request *)(io + 1) != req)
- mempool_free(req, cc->req_pool);
+ mempool_free(req, &cc->req_pool);
}
static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_bio)
@@ -1409,7 +1409,7 @@ retry:
if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
mutex_lock(&cc->bio_alloc_lock);
- clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
+ clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, &cc->bs);
if (!clone)
goto out;
@@ -1418,7 +1418,7 @@ retry:
remaining_size = size;
for (i = 0; i < nr_iovecs; i++) {
- page = mempool_alloc(cc->page_pool, gfp_mask);
+ page = mempool_alloc(&cc->page_pool, gfp_mask);
if (!page) {
crypt_free_buffer_pages(cc, clone);
bio_put(clone);
@@ -1453,7 +1453,7 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
bio_for_each_segment_all(bv, clone, i) {
BUG_ON(!bv->bv_page);
- mempool_free(bv->bv_page, cc->page_pool);
+ mempool_free(bv->bv_page, &cc->page_pool);
}
}
@@ -1492,7 +1492,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
crypt_free_req(cc, io->ctx.r.req, base_bio);
if (unlikely(io->integrity_metadata_from_pool))
- mempool_free(io->integrity_metadata, io->cc->tag_pool);
+ mempool_free(io->integrity_metadata, &io->cc->tag_pool);
else
kfree(io->integrity_metadata);
@@ -1565,7 +1565,7 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
* biovecs we don't need to worry about the block layer
* modifying the biovec array; so leverage bio_clone_fast().
*/
- clone = bio_clone_fast(io->base_bio, gfp, cc->bs);
+ clone = bio_clone_fast(io->base_bio, gfp, &cc->bs);
if (!clone)
return 1;
@@ -1878,8 +1878,9 @@ static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode)
unsigned i;
int err;
- cc->cipher_tfm.tfms = kzalloc(cc->tfms_count *
- sizeof(struct crypto_skcipher *), GFP_KERNEL);
+ cc->cipher_tfm.tfms = kcalloc(cc->tfms_count,
+ sizeof(struct crypto_skcipher *),
+ GFP_KERNEL);
if (!cc->cipher_tfm.tfms)
return -ENOMEM;
@@ -2219,15 +2220,13 @@ static void crypt_dtr(struct dm_target *ti)
crypt_free_tfms(cc);
- if (cc->bs)
- bioset_free(cc->bs);
+ bioset_exit(&cc->bs);
- mempool_destroy(cc->page_pool);
- mempool_destroy(cc->req_pool);
- mempool_destroy(cc->tag_pool);
+ mempool_exit(&cc->page_pool);
+ mempool_exit(&cc->req_pool);
+ mempool_exit(&cc->tag_pool);
- if (cc->page_pool)
- WARN_ON(percpu_counter_sum(&cc->n_allocated_pages) != 0);
+ WARN_ON(percpu_counter_sum(&cc->n_allocated_pages) != 0);
percpu_counter_destroy(&cc->n_allocated_pages);
if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
@@ -2743,8 +2742,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
iv_size_padding = align_mask;
}
- ret = -ENOMEM;
-
/* ...| IV + padding | original IV | original sec. number | bio tag offset | */
additional_req_size = sizeof(struct dm_crypt_request) +
iv_size_padding + cc->iv_size +
@@ -2752,8 +2749,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
sizeof(uint64_t) +
sizeof(unsigned int);
- cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + additional_req_size);
- if (!cc->req_pool) {
+ ret = mempool_init_kmalloc_pool(&cc->req_pool, MIN_IOS, cc->dmreq_start + additional_req_size);
+ if (ret) {
ti->error = "Cannot allocate crypt request mempool";
goto bad;
}
@@ -2762,14 +2759,14 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
ARCH_KMALLOC_MINALIGN);
- cc->page_pool = mempool_create(BIO_MAX_PAGES, crypt_page_alloc, crypt_page_free, cc);
- if (!cc->page_pool) {
+ ret = mempool_init(&cc->page_pool, BIO_MAX_PAGES, crypt_page_alloc, crypt_page_free, cc);
+ if (ret) {
ti->error = "Cannot allocate page mempool";
goto bad;
}
- cc->bs = bioset_create(MIN_IOS, 0, BIOSET_NEED_BVECS);
- if (!cc->bs) {
+ ret = bioset_init(&cc->bs, MIN_IOS, 0, BIOSET_NEED_BVECS);
+ if (ret) {
ti->error = "Cannot allocate crypt bioset";
goto bad;
}
@@ -2806,11 +2803,10 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (!cc->tag_pool_max_sectors)
cc->tag_pool_max_sectors = 1;
- cc->tag_pool = mempool_create_kmalloc_pool(MIN_IOS,
+ ret = mempool_init_kmalloc_pool(&cc->tag_pool, MIN_IOS,
cc->tag_pool_max_sectors * cc->on_disk_tag_size);
- if (!cc->tag_pool) {
+ if (ret) {
ti->error = "Cannot allocate integrity tags mempool";
- ret = -ENOMEM;
goto bad;
}
@@ -2903,7 +2899,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
if (bio_sectors(bio) > cc->tag_pool_max_sectors)
dm_accept_partial_bio(bio, cc->tag_pool_max_sectors);
- io->integrity_metadata = mempool_alloc(cc->tag_pool, GFP_NOIO);
+ io->integrity_metadata = mempool_alloc(&cc->tag_pool, GFP_NOIO);
io->integrity_metadata_from_pool = true;
}
}
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 77d9fe58dae2..86438b2f10dd 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -142,7 +142,7 @@ struct dm_integrity_c {
unsigned tag_size;
__s8 log2_tag_size;
sector_t start;
- mempool_t *journal_io_mempool;
+ mempool_t journal_io_mempool;
struct dm_io_client *io;
struct dm_bufio_client *bufio;
struct workqueue_struct *metadata_wq;
@@ -1817,7 +1817,7 @@ static void complete_copy_from_journal(unsigned long error, void *context)
struct journal_completion *comp = io->comp;
struct dm_integrity_c *ic = comp->ic;
remove_range(ic, &io->range);
- mempool_free(io, ic->journal_io_mempool);
+ mempool_free(io, &ic->journal_io_mempool);
if (unlikely(error != 0))
dm_integrity_io_error(ic, "copying from journal", -EIO);
complete_journal_op(comp);
@@ -1886,7 +1886,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
}
next_loop = k - 1;
- io = mempool_alloc(ic->journal_io_mempool, GFP_NOIO);
+ io = mempool_alloc(&ic->journal_io_mempool, GFP_NOIO);
io->comp = &comp;
io->range.logical_sector = sec;
io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
@@ -1918,7 +1918,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
if (j == k) {
remove_range_unlocked(ic, &io->range);
spin_unlock_irq(&ic->endio_wait.lock);
- mempool_free(io, ic->journal_io_mempool);
+ mempool_free(io, &ic->journal_io_mempool);
goto skip_io;
}
for (l = j; l < k; l++) {
@@ -2440,7 +2440,7 @@ static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, str
unsigned i;
for (i = 0; i < ic->journal_sections; i++)
kvfree(sl[i]);
- kfree(sl);
+ kvfree(sl);
}
static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl)
@@ -2448,7 +2448,9 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int
struct scatterlist **sl;
unsigned i;
- sl = kvmalloc(ic->journal_sections * sizeof(struct scatterlist *), GFP_KERNEL | __GFP_ZERO);
+ sl = kvmalloc_array(ic->journal_sections,
+ sizeof(struct scatterlist *),
+ GFP_KERNEL | __GFP_ZERO);
if (!sl)
return NULL;
@@ -2464,7 +2466,8 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int
n_pages = (end_index - start_index + 1);
- s = kvmalloc(n_pages * sizeof(struct scatterlist), GFP_KERNEL);
+ s = kvmalloc_array(n_pages, sizeof(struct scatterlist),
+ GFP_KERNEL);
if (!s) {
dm_integrity_free_journal_scatterlist(ic, sl);
return NULL;
@@ -2643,7 +2646,9 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
goto bad;
}
- sg = kvmalloc((ic->journal_pages + 1) * sizeof(struct scatterlist), GFP_KERNEL);
+ sg = kvmalloc_array(ic->journal_pages + 1,
+ sizeof(struct scatterlist),
+ GFP_KERNEL);
if (!sg) {
*error = "Unable to allocate sg list";
r = -ENOMEM;
@@ -2709,7 +2714,9 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
r = -ENOMEM;
goto bad;
}
- ic->sk_requests = kvmalloc(ic->journal_sections * sizeof(struct skcipher_request *), GFP_KERNEL | __GFP_ZERO);
+ ic->sk_requests = kvmalloc_array(ic->journal_sections,
+ sizeof(struct skcipher_request *),
+ GFP_KERNEL | __GFP_ZERO);
if (!ic->sk_requests) {
*error = "Unable to allocate sk requests";
r = -ENOMEM;
@@ -2743,7 +2750,8 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
r = -ENOMEM;
goto bad;
}
- section_req->iv = kmalloc(ivsize * 2, GFP_KERNEL);
+ section_req->iv = kmalloc_array(ivsize, 2,
+ GFP_KERNEL);
if (!section_req->iv) {
skcipher_request_free(section_req);
*error = "Unable to allocate iv";
@@ -2980,9 +2988,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
- ic->journal_io_mempool = mempool_create_slab_pool(JOURNAL_IO_MEMPOOL, journal_io_cache);
- if (!ic->journal_io_mempool) {
- r = -ENOMEM;
+ r = mempool_init_slab_pool(&ic->journal_io_mempool, JOURNAL_IO_MEMPOOL, journal_io_cache);
+ if (r) {
ti->error = "Cannot allocate mempool";
goto bad;
}
@@ -3196,7 +3203,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
destroy_workqueue(ic->writer_wq);
if (ic->bufio)
dm_bufio_client_destroy(ic->bufio);
- mempool_destroy(ic->journal_io_mempool);
+ mempool_exit(&ic->journal_io_mempool);
if (ic->io)
dm_io_client_destroy(ic->io);
if (ic->dev)
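The journal allocations above also adopt the overflow-checked allocators: kvmalloc(n * size, ...) becomes kvmalloc_array(n, size, ...), and the stray kfree() of a kvmalloc()ed pointer becomes kvfree(), which is required because kvmalloc() may have fallen back to vmalloc(). The idiom, roughly:

	/* open-coded multiply can wrap on 32-bit and under-allocate */
	sl = kvmalloc(n * sizeof(*sl), GFP_KERNEL | __GFP_ZERO);

	/* checked multiply: returns NULL if n * sizeof(*sl) would overflow */
	sl = kvmalloc_array(n, sizeof(*sl), GFP_KERNEL | __GFP_ZERO);
	...
	kvfree(sl);	/* correct for both the kmalloc and the vmalloc case */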
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index a8d914d5abbe..81ffc59d05c9 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -22,8 +22,8 @@
#define DM_IO_MAX_REGIONS BITS_PER_LONG
struct dm_io_client {
- mempool_t *pool;
- struct bio_set *bios;
+ mempool_t pool;
+ struct bio_set bios;
};
/*
@@ -49,32 +49,33 @@ struct dm_io_client *dm_io_client_create(void)
{
struct dm_io_client *client;
unsigned min_ios = dm_get_reserved_bio_based_ios();
+ int ret;
- client = kmalloc(sizeof(*client), GFP_KERNEL);
+ client = kzalloc(sizeof(*client), GFP_KERNEL);
if (!client)
return ERR_PTR(-ENOMEM);
- client->pool = mempool_create_slab_pool(min_ios, _dm_io_cache);
- if (!client->pool)
+ ret = mempool_init_slab_pool(&client->pool, min_ios, _dm_io_cache);
+ if (ret)
goto bad;
- client->bios = bioset_create(min_ios, 0, BIOSET_NEED_BVECS);
- if (!client->bios)
+ ret = bioset_init(&client->bios, min_ios, 0, BIOSET_NEED_BVECS);
+ if (ret)
goto bad;
return client;
bad:
- mempool_destroy(client->pool);
+ mempool_exit(&client->pool);
kfree(client);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(ret);
}
EXPORT_SYMBOL(dm_io_client_create);
void dm_io_client_destroy(struct dm_io_client *client)
{
- mempool_destroy(client->pool);
- bioset_free(client->bios);
+ mempool_exit(&client->pool);
+ bioset_exit(&client->bios);
kfree(client);
}
EXPORT_SYMBOL(dm_io_client_destroy);
@@ -120,7 +121,7 @@ static void complete_io(struct io *io)
invalidate_kernel_vmap_range(io->vma_invalidate_address,
io->vma_invalidate_size);
- mempool_free(io, io->client->pool);
+ mempool_free(io, &io->client->pool);
fn(error_bits, context);
}
@@ -344,7 +345,7 @@ static void do_region(int op, int op_flags, unsigned region,
dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
}
- bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
+ bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, &io->client->bios);
bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
bio_set_dev(bio, where->bdev);
bio->bi_end_io = endio;
@@ -442,7 +443,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
init_completion(&sio.wait);
- io = mempool_alloc(client->pool, GFP_NOIO);
+ io = mempool_alloc(&client->pool, GFP_NOIO);
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->client = client;
@@ -474,7 +475,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
return -EIO;
}
- io = mempool_alloc(client->pool, GFP_NOIO);
+ io = mempool_alloc(&client->pool, GFP_NOIO);
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->client = client;
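struct bio_set gets the same embedding treatment as mempool_t: bioset_create()/bioset_free() become bioset_init()/bioset_exit(), and bio allocation takes the set by address. Sketched against the client above:

	ret = bioset_init(&client->bios, min_ios, 0, BIOSET_NEED_BVECS);
	if (ret)
		goto bad;
	bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, &client->bios);
	...
	bioset_exit(&client->bios);

Because bioset_exit(), like mempool_exit(), tolerates a zero-initialized set, the bad: path can unwind unconditionally once the containing structure comes from kzalloc().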
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 5acf77de5945..b810ea77e6b1 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1344,7 +1344,8 @@ static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si
goto err_unlock_md_type;
}
} else if (!is_valid_type(dm_get_md_type(md), dm_table_get_type(t))) {
- DMWARN("can't change device type after initial table load.");
+ DMWARN("can't change device type (old=%u vs new=%u) after initial table load.",
+ dm_get_md_type(md), dm_table_get_type(t));
r = -EINVAL;
goto err_unlock_md_type;
}
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index e6e7c686646d..3c7547a3c371 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -45,15 +45,16 @@ struct dm_kcopyd_client {
struct dm_io_client *io_client;
wait_queue_head_t destroyq;
- atomic_t nr_jobs;
- mempool_t *job_pool;
+ mempool_t job_pool;
struct workqueue_struct *kcopyd_wq;
struct work_struct kcopyd_work;
struct dm_kcopyd_throttle *throttle;
+ atomic_t nr_jobs;
+
/*
* We maintain three lists of jobs:
*
@@ -479,7 +480,7 @@ static int run_complete_job(struct kcopyd_job *job)
*/
if (job->master_job == job) {
mutex_destroy(&job->lock);
- mempool_free(job, kc->job_pool);
+ mempool_free(job, &kc->job_pool);
}
fn(read_err, write_err, context);
@@ -751,7 +752,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
* Allocate an array of jobs consisting of one master job
* followed by SPLIT_COUNT sub jobs.
*/
- job = mempool_alloc(kc->job_pool, GFP_NOIO);
+ job = mempool_alloc(&kc->job_pool, GFP_NOIO);
mutex_init(&job->lock);
/*
@@ -835,7 +836,7 @@ void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
{
struct kcopyd_job *job;
- job = mempool_alloc(kc->job_pool, GFP_NOIO);
+ job = mempool_alloc(&kc->job_pool, GFP_NOIO);
memset(job, 0, sizeof(struct kcopyd_job));
job->kc = kc;
@@ -879,10 +880,10 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
*---------------------------------------------------------------*/
struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
{
- int r = -ENOMEM;
+ int r;
struct dm_kcopyd_client *kc;
- kc = kmalloc(sizeof(*kc), GFP_KERNEL);
+ kc = kzalloc(sizeof(*kc), GFP_KERNEL);
if (!kc)
return ERR_PTR(-ENOMEM);
@@ -892,14 +893,16 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro
INIT_LIST_HEAD(&kc->pages_jobs);
kc->throttle = throttle;
- kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
- if (!kc->job_pool)
+ r = mempool_init_slab_pool(&kc->job_pool, MIN_JOBS, _job_cache);
+ if (r)
goto bad_slab;
INIT_WORK(&kc->kcopyd_work, do_work);
kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
- if (!kc->kcopyd_wq)
+ if (!kc->kcopyd_wq) {
+ r = -ENOMEM;
goto bad_workqueue;
+ }
kc->pages = NULL;
kc->nr_reserved_pages = kc->nr_free_pages = 0;
@@ -923,7 +926,7 @@ bad_io_client:
bad_client_pages:
destroy_workqueue(kc->kcopyd_wq);
bad_workqueue:
- mempool_destroy(kc->job_pool);
+ mempool_exit(&kc->job_pool);
bad_slab:
kfree(kc);
@@ -942,7 +945,7 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
destroy_workqueue(kc->kcopyd_wq);
dm_io_client_destroy(kc->io_client);
client_free_pages(kc);
- mempool_destroy(kc->job_pool);
+ mempool_exit(&kc->job_pool);
kfree(kc);
}
EXPORT_SYMBOL(dm_kcopyd_client_destroy);
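One subtlety in dm_kcopyd_client_create(): r used to be preloaded with -ENOMEM, but after the conversion it carries whatever mempool_init_slab_pool() returned, so any later branch that fails with a NULL return must set the code itself:

	kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
	if (!kc->kcopyd_wq) {
		r = -ENOMEM;	/* NULL-returning APIs don't set r for us */
		goto bad_workqueue;
	}

Mixing int-returning and NULL-returning allocators in one constructor makes this an easy error path to get wrong; the explicit assignment keeps the final ERR_PTR(r) meaningful.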
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 775c06d953b7..d10964d41fd7 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -185,9 +185,24 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}
+static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i)
+{
+ struct linear_c *lc = ti->private;
+ struct block_device *bdev = lc->dev->bdev;
+ struct dax_device *dax_dev = lc->dev->dax_dev;
+ sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+
+ dev_sector = linear_map_sector(ti, sector);
+ if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
+ return 0;
+ return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
+}
+
#else
#define linear_dax_direct_access NULL
#define linear_dax_copy_from_iter NULL
+#define linear_dax_copy_to_iter NULL
#endif
static struct target_type linear_target = {
@@ -204,6 +219,7 @@ static struct target_type linear_target = {
.iterate_devices = linear_iterate_devices,
.direct_access = linear_dax_direct_access,
.dax_copy_from_iter = linear_dax_copy_from_iter,
+ .dax_copy_to_iter = linear_dax_copy_to_iter,
};
int __init dm_linear_init(void)
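The new dax_copy_to_iter hooks mirror the existing dax_copy_from_iter ones: convert the page offset supplied by the DAX core into a sector, remap it through the target, then ask bdev_dax_pgoff() for the page offset on the underlying device. With 4 KiB pages PAGE_SECTORS is 8, so pgoff 3 addresses sector 24 before remapping:

	sector_t dev_sector, sector = pgoff * PAGE_SECTORS;	/* pages -> 512-byte sectors */

	dev_sector = linear_map_sector(ti, sector);		/* apply the target's mapping */
	if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
		return 0;	/* 0 bytes copied signals failure to the caller */
	return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);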
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 53b7b06d0aa8..52090bee17c2 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -76,7 +76,7 @@ struct log_c {
*/
uint32_t integrated_flush;
- mempool_t *flush_entry_pool;
+ mempool_t flush_entry_pool;
};
static struct kmem_cache *_flush_entry_cache;
@@ -249,11 +249,10 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
goto out;
}
- lc->flush_entry_pool = mempool_create_slab_pool(FLUSH_ENTRY_POOL_SIZE,
- _flush_entry_cache);
- if (!lc->flush_entry_pool) {
+ r = mempool_init_slab_pool(&lc->flush_entry_pool, FLUSH_ENTRY_POOL_SIZE,
+ _flush_entry_cache);
+ if (r) {
DMERR("Failed to create flush_entry_pool");
- r = -ENOMEM;
goto out;
}
@@ -313,7 +312,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
out:
kfree(devices_rdata);
if (r) {
- mempool_destroy(lc->flush_entry_pool);
+ mempool_exit(&lc->flush_entry_pool);
kfree(lc);
kfree(ctr_str);
} else {
@@ -342,7 +341,7 @@ static void userspace_dtr(struct dm_dirty_log *log)
if (lc->log_dev)
dm_put_device(lc->ti, lc->log_dev);
- mempool_destroy(lc->flush_entry_pool);
+ mempool_exit(&lc->flush_entry_pool);
kfree(lc->usr_argv_str);
kfree(lc);
@@ -570,7 +569,7 @@ static int userspace_flush(struct dm_dirty_log *log)
int mark_list_is_empty;
int clear_list_is_empty;
struct dm_dirty_log_flush_entry *fe, *tmp_fe;
- mempool_t *flush_entry_pool = lc->flush_entry_pool;
+ mempool_t *flush_entry_pool = &lc->flush_entry_pool;
spin_lock_irqsave(&lc->flush_lock, flags);
list_splice_init(&lc->mark_list, &mark_list);
@@ -653,7 +652,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
struct dm_dirty_log_flush_entry *fe;
/* Wait for an allocation, but _never_ fail */
- fe = mempool_alloc(lc->flush_entry_pool, GFP_NOIO);
+ fe = mempool_alloc(&lc->flush_entry_pool, GFP_NOIO);
BUG_ON(!fe);
spin_lock_irqsave(&lc->flush_lock, flags);
@@ -687,7 +686,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
* to cause the region to be resync'ed when the
* device is activated next time.
*/
- fe = mempool_alloc(lc->flush_entry_pool, GFP_ATOMIC);
+ fe = mempool_alloc(&lc->flush_entry_pool, GFP_ATOMIC);
if (!fe) {
DMERR("Failed to allocate memory to clear region.");
return;
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index c90c7c08a77f..9ea2b0291f20 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -962,9 +962,23 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
dax_copy:
return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
}
+
+static size_t log_writes_dax_copy_to_iter(struct dm_target *ti,
+ pgoff_t pgoff, void *addr, size_t bytes,
+ struct iov_iter *i)
+{
+ struct log_writes_c *lc = ti->private;
+ sector_t sector = pgoff * PAGE_SECTORS;
+
+ if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
+ return 0;
+ return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
+}
+
#else
#define log_writes_dax_direct_access NULL
#define log_writes_dax_copy_from_iter NULL
+#define log_writes_dax_copy_to_iter NULL
#endif
static struct target_type log_writes_target = {
@@ -982,6 +996,7 @@ static struct target_type log_writes_target = {
.io_hints = log_writes_io_hints,
.direct_access = log_writes_dax_direct_access,
.dax_copy_from_iter = log_writes_dax_copy_from_iter,
+ .dax_copy_to_iter = log_writes_dax_copy_to_iter,
};
static int __init dm_log_writes_init(void)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 203a0419d2b0..d94ba6f72ff5 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -520,7 +520,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
bdev = pgpath->path.dev->bdev;
q = bdev_get_queue(bdev);
- clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC);
+ clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE,
+ BLK_MQ_REQ_NOWAIT);
if (IS_ERR(clone)) {
/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
if (blk_queue_dying(q)) {
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6f823f44b4aa..75df4c9d8b54 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -588,7 +588,7 @@ static const char *raid10_md_layout_to_format(int layout)
}
/* Return md raid10 algorithm for @name */
-static const int raid10_name_to_format(const char *name)
+static int raid10_name_to_format(const char *name)
{
if (!strcasecmp(name, "near"))
return ALGORITHM_RAID10_NEAR;
@@ -756,7 +756,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
return ERR_PTR(-EINVAL);
}
- rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
+ rs = kzalloc(struct_size(rs, dev, raid_devs), GFP_KERNEL);
if (!rs) {
ti->error = "Cannot allocate raid context";
return ERR_PTR(-ENOMEM);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 580c49cc8079..5903e492bb34 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -23,6 +23,8 @@
#define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */
+#define MAX_NR_MIRRORS (DM_KCOPYD_MAX_REGIONS + 1)
+
#define DM_RAID1_HANDLE_ERRORS 0x01
#define DM_RAID1_KEEP_LOG 0x02
#define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS)
@@ -255,7 +257,7 @@ static int mirror_flush(struct dm_target *ti)
unsigned long error_bits;
unsigned int i;
- struct dm_io_region io[ms->nr_mirrors];
+ struct dm_io_region io[MAX_NR_MIRRORS];
struct mirror *m;
struct dm_io_request io_req = {
.bi_op = REQ_OP_WRITE,
@@ -651,7 +653,7 @@ static void write_callback(unsigned long error, void *context)
static void do_write(struct mirror_set *ms, struct bio *bio)
{
unsigned int i;
- struct dm_io_region io[ms->nr_mirrors], *dest = io;
+ struct dm_io_region io[MAX_NR_MIRRORS], *dest = io;
struct mirror *m;
struct dm_io_request io_req = {
.bi_op = REQ_OP_WRITE,
@@ -1083,7 +1085,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
argc -= args_used;
if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 ||
- nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) {
+ nr_mirrors < 2 || nr_mirrors > MAX_NR_MIRRORS) {
ti->error = "Invalid number of mirrors";
dm_dirty_log_destroy(dl);
return -EINVAL;
@@ -1404,7 +1406,7 @@ static void mirror_status(struct dm_target *ti, status_type_t type,
int num_feature_args = 0;
struct mirror_set *ms = (struct mirror_set *) ti->private;
struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
- char buffer[ms->nr_mirrors + 1];
+ char buffer[MAX_NR_MIRRORS + 1];
switch (type) {
case STATUSTYPE_INFO:
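The variable-length arrays in dm-raid1 sat on the kernel stack sized by ms->nr_mirrors at run time. Since mirror_ctr() already rejects more than DM_KCOPYD_MAX_REGIONS + 1 mirrors, bounding them with that constant is semantically identical while giving the compiler (and -Wvla) a fixed stack frame:

	#define MAX_NR_MIRRORS	(DM_KCOPYD_MAX_REGIONS + 1)

	struct dm_io_region io[MAX_NR_MIRRORS];	/* ctr enforces nr_mirrors <= MAX_NR_MIRRORS */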
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 85c32b22a420..1f760451e6f4 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -63,27 +63,28 @@ struct dm_region_hash {
/* hash table */
rwlock_t hash_lock;
- mempool_t *region_pool;
unsigned mask;
unsigned nr_buckets;
unsigned prime;
unsigned shift;
struct list_head *buckets;
+ /*
+ * If there was a flush failure no regions can be marked clean.
+ */
+ int flush_failure;
+
unsigned max_recovery; /* Max # of regions to recover in parallel */
spinlock_t region_lock;
atomic_t recovery_in_flight;
- struct semaphore recovery_count;
struct list_head clean_regions;
struct list_head quiesced_regions;
struct list_head recovered_regions;
struct list_head failed_recovered_regions;
+ struct semaphore recovery_count;
- /*
- * If there was a flush failure no regions can be marked clean.
- */
- int flush_failure;
+ mempool_t region_pool;
void *context;
sector_t target_begin;
@@ -169,6 +170,7 @@ struct dm_region_hash *dm_region_hash_create(
struct dm_region_hash *rh;
unsigned nr_buckets, max_buckets;
size_t i;
+ int ret;
/*
* Calculate a suitable number of buckets for our hash
@@ -179,7 +181,7 @@ struct dm_region_hash *dm_region_hash_create(
;
nr_buckets >>= 1;
- rh = kmalloc(sizeof(*rh), GFP_KERNEL);
+ rh = kzalloc(sizeof(*rh), GFP_KERNEL);
if (!rh) {
DMERR("unable to allocate region hash memory");
return ERR_PTR(-ENOMEM);
@@ -201,7 +203,7 @@ struct dm_region_hash *dm_region_hash_create(
rh->shift = RH_HASH_SHIFT;
rh->prime = RH_HASH_MULT;
- rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
+ rh->buckets = vmalloc(array_size(nr_buckets, sizeof(*rh->buckets)));
if (!rh->buckets) {
DMERR("unable to allocate region hash bucket memory");
kfree(rh);
@@ -220,9 +222,9 @@ struct dm_region_hash *dm_region_hash_create(
INIT_LIST_HEAD(&rh->failed_recovered_regions);
rh->flush_failure = 0;
- rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
- sizeof(struct dm_region));
- if (!rh->region_pool) {
+ ret = mempool_init_kmalloc_pool(&rh->region_pool, MIN_REGIONS,
+ sizeof(struct dm_region));
+ if (ret) {
vfree(rh->buckets);
kfree(rh);
rh = ERR_PTR(-ENOMEM);
@@ -242,14 +244,14 @@ void dm_region_hash_destroy(struct dm_region_hash *rh)
list_for_each_entry_safe(reg, nreg, rh->buckets + h,
hash_list) {
BUG_ON(atomic_read(&reg->pending));
- mempool_free(reg, rh->region_pool);
+ mempool_free(reg, &rh->region_pool);
}
}
if (rh->log)
dm_dirty_log_destroy(rh->log);
- mempool_destroy(rh->region_pool);
+ mempool_exit(&rh->region_pool);
vfree(rh->buckets);
kfree(rh);
}
@@ -287,7 +289,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
{
struct dm_region *reg, *nreg;
- nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
+ nreg = mempool_alloc(&rh->region_pool, GFP_ATOMIC);
if (unlikely(!nreg))
nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
@@ -303,7 +305,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
reg = __rh_lookup(rh, region);
if (reg)
/* We lost the race. */
- mempool_free(nreg, rh->region_pool);
+ mempool_free(nreg, &rh->region_pool);
else {
__rh_insert(rh, nreg);
if (nreg->state == DM_RH_CLEAN) {
@@ -481,17 +483,17 @@ void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled)
list_for_each_entry_safe(reg, next, &recovered, list) {
rh->log->type->clear_region(rh->log, reg->key);
complete_resync_work(reg, 1);
- mempool_free(reg, rh->region_pool);
+ mempool_free(reg, &rh->region_pool);
}
list_for_each_entry_safe(reg, next, &failed_recovered, list) {
complete_resync_work(reg, errors_handled ? 0 : 1);
- mempool_free(reg, rh->region_pool);
+ mempool_free(reg, &rh->region_pool);
}
list_for_each_entry_safe(reg, next, &clean, list) {
rh->log->type->clear_region(rh->log, reg->key);
- mempool_free(reg, rh->region_pool);
+ mempool_free(reg, &rh->region_pool);
}
rh->log->type->flush(rh->log);
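Where the element count is not bounded at compile time, the multiplications feeding vmalloc() go through array_size(), the two-factor sibling of struct_size():

	rh->buckets = vmalloc(array_size(nr_buckets, sizeof(*rh->buckets)));

array_size() saturates to SIZE_MAX on overflow, and an allocation of SIZE_MAX reliably fails instead of returning an undersized buffer. The field reshuffle in struct dm_region_hash above changes layout only, not behaviour.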
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index bf0b840645cc..6e547b8dd298 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -406,7 +406,7 @@ static blk_status_t dm_dispatch_clone_request(struct request *clone, struct requ
if (blk_queue_io_stat(clone->q))
clone->rq_flags |= RQF_IO_STAT;
- clone->start_time = jiffies;
+ clone->start_time_ns = ktime_get_ns();
r = blk_insert_cloned_request(clone->q, clone);
if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE)
/* must complete clone in terms of original request */
@@ -433,7 +433,7 @@ static int setup_clone(struct request *clone, struct request *rq,
{
int r;
- r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
+ r = blk_rq_prep_clone(clone, rq, &tio->md->bs, gfp_mask,
dm_rq_bio_constructor, tio);
if (r)
return r;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 216035be5661..97de7a7334d4 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -87,7 +87,7 @@ struct dm_snapshot {
*/
struct list_head out_of_order_list;
- mempool_t *pending_pool;
+ mempool_t pending_pool;
struct dm_exception_table pending;
struct dm_exception_table complete;
@@ -326,8 +326,8 @@ static int init_origin_hash(void)
{
int i;
- _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
- GFP_KERNEL);
+ _origins = kmalloc_array(ORIGIN_HASH_SIZE, sizeof(struct list_head),
+ GFP_KERNEL);
if (!_origins) {
DMERR("unable to allocate memory for _origins");
return -ENOMEM;
@@ -335,8 +335,9 @@ static int init_origin_hash(void)
for (i = 0; i < ORIGIN_HASH_SIZE; i++)
INIT_LIST_HEAD(_origins + i);
- _dm_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
- GFP_KERNEL);
+ _dm_origins = kmalloc_array(ORIGIN_HASH_SIZE,
+ sizeof(struct list_head),
+ GFP_KERNEL);
if (!_dm_origins) {
DMERR("unable to allocate memory for _dm_origins");
kfree(_origins);
@@ -682,7 +683,7 @@ static void free_completed_exception(struct dm_exception *e)
static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s)
{
- struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool,
+ struct dm_snap_pending_exception *pe = mempool_alloc(&s->pending_pool,
GFP_NOIO);
atomic_inc(&s->pending_exceptions_count);
@@ -695,7 +696,7 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
{
struct dm_snapshot *s = pe->snap;
- mempool_free(pe, s->pending_pool);
+ mempool_free(pe, &s->pending_pool);
smp_mb__before_atomic();
atomic_dec(&s->pending_exceptions_count);
}
@@ -1120,7 +1121,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
origin_mode = FMODE_WRITE;
}
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
if (!s) {
ti->error = "Cannot allocate private snapshot structure";
r = -ENOMEM;
@@ -1196,10 +1197,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_kcopyd;
}
- s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
- if (!s->pending_pool) {
+ r = mempool_init_slab_pool(&s->pending_pool, MIN_IOS, pending_cache);
+ if (r) {
ti->error = "Could not allocate mempool for pending exceptions";
- r = -ENOMEM;
goto bad_pending_pool;
}
@@ -1259,7 +1259,7 @@ bad_read_metadata:
unregister_snapshot(s);
bad_load_and_register:
- mempool_destroy(s->pending_pool);
+ mempool_exit(&s->pending_pool);
bad_pending_pool:
dm_kcopyd_client_destroy(s->kcopyd_client);
@@ -1355,7 +1355,7 @@ static void snapshot_dtr(struct dm_target *ti)
while (atomic_read(&s->pending_exceptions_count))
msleep(1);
/*
- * Ensure instructions in mempool_destroy aren't reordered
+ * Ensure instructions in mempool_exit aren't reordered
* before atomic_read.
*/
smp_mb();
@@ -1367,7 +1367,7 @@ static void snapshot_dtr(struct dm_target *ti)
__free_exceptions(s);
- mempool_destroy(s->pending_pool);
+ mempool_exit(&s->pending_pool);
dm_exception_store_destroy(s->store);
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 56059fb56e2d..21de30b4e2a1 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -915,7 +915,9 @@ static int parse_histogram(const char *h, unsigned *n_histogram_entries,
if (*q == ',')
(*n_histogram_entries)++;
- *histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+ *histogram_boundaries = kmalloc_array(*n_histogram_entries,
+ sizeof(unsigned long long),
+ GFP_KERNEL);
if (!*histogram_boundaries)
return -ENOMEM;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index fe7fb9b1aec3..8547d7594338 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -354,9 +354,29 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}
+static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i)
+{
+ sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+ struct stripe_c *sc = ti->private;
+ struct dax_device *dax_dev;
+ struct block_device *bdev;
+ uint32_t stripe;
+
+ stripe_map_sector(sc, sector, &stripe, &dev_sector);
+ dev_sector += sc->stripe[stripe].physical_start;
+ dax_dev = sc->stripe[stripe].dev->dax_dev;
+ bdev = sc->stripe[stripe].dev->bdev;
+
+ if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
+ return 0;
+ return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
+}
+
#else
#define stripe_dax_direct_access NULL
#define stripe_dax_copy_from_iter NULL
+#define stripe_dax_copy_to_iter NULL
#endif
/*
@@ -478,6 +498,7 @@ static struct target_type stripe_target = {
.io_hints = stripe_io_hints,
.direct_access = stripe_dax_direct_access,
.dax_copy_from_iter = stripe_dax_copy_from_iter,
+ .dax_copy_to_iter = stripe_dax_copy_to_iter,
};
int __init dm_stripe_init(void)
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index 7924a6a33ddc..fae35caf3672 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -114,7 +114,8 @@ static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
return -EINVAL;
}
- sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t));
+ sctx->region_table = vmalloc(array_size(nr_slots,
+ sizeof(region_table_slot_t)));
if (!sctx->region_table) {
ti->error = "Cannot allocate region table";
return -ENOMEM;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 0589a4da12bb..3d0e2c198f06 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -548,23 +548,23 @@ static int adjoin(struct dm_table *table, struct dm_target *ti)
* On the other hand, dm-switch needs to process bulk data using messages and
* excessive use of GFP_NOIO could cause trouble.
*/
-static char **realloc_argv(unsigned *array_size, char **old_argv)
+static char **realloc_argv(unsigned *size, char **old_argv)
{
char **argv;
unsigned new_size;
gfp_t gfp;
- if (*array_size) {
- new_size = *array_size * 2;
+ if (*size) {
+ new_size = *size * 2;
gfp = GFP_KERNEL;
} else {
new_size = 8;
gfp = GFP_NOIO;
}
- argv = kmalloc(new_size * sizeof(*argv), gfp);
+ argv = kmalloc_array(new_size, sizeof(*argv), gfp);
if (argv) {
- memcpy(argv, old_argv, *array_size * sizeof(*argv));
- *array_size = new_size;
+ memcpy(argv, old_argv, *size * sizeof(*argv));
+ *size = new_size;
}
kfree(old_argv);
@@ -885,9 +885,7 @@ EXPORT_SYMBOL_GPL(dm_table_set_type);
static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
- struct request_queue *q = bdev_get_queue(dev->bdev);
-
- return q && blk_queue_dax(q);
+ return bdev_dax_supported(dev->bdev, PAGE_SIZE);
}
static bool dm_table_supports_dax(struct dm_table *t)
@@ -1907,6 +1905,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (dm_table_supports_dax(t))
blk_queue_flag_set(QUEUE_FLAG_DAX, q);
+ else
+ blk_queue_flag_clear(QUEUE_FLAG_DAX, q);
+
if (dm_table_supports_dax_write_cache(t))
dax_write_cache(t->md->dax_dev, true);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 36ef284ad086..72142021b5c9 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -776,7 +776,6 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
static int __commit_transaction(struct dm_pool_metadata *pmd)
{
int r;
- size_t metadata_len, data_len;
struct thin_disk_superblock *disk_super;
struct dm_block *sblock;
@@ -797,14 +796,6 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
if (r < 0)
return r;
- r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
- if (r < 0)
- return r;
-
- r = dm_sm_root_size(pmd->data_sm, &data_len);
- if (r < 0)
- return r;
-
r = save_sm_roots(pmd);
if (r < 0)
return r;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index b11107497d2e..b900723bbd0f 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -240,9 +240,9 @@ struct pool {
struct dm_bio_prison *prison;
struct dm_kcopyd_client *copier;
+ struct work_struct worker;
struct workqueue_struct *wq;
struct throttle throttle;
- struct work_struct worker;
struct delayed_work waker;
struct delayed_work no_space_timeout;
@@ -260,7 +260,6 @@ struct pool {
struct dm_deferred_set *all_io_ds;
struct dm_thin_new_mapping *next_mapping;
- mempool_t *mapping_pool;
process_bio_fn process_bio;
process_bio_fn process_discard;
@@ -273,6 +272,8 @@ struct pool {
process_mapping_fn process_prepared_discard_pt2;
struct dm_bio_prison_cell **cell_sort_array;
+
+ mempool_t mapping_pool;
};
static enum pool_mode get_pool_mode(struct pool *pool);
@@ -917,7 +918,7 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
cell_error(m->tc->pool, m->cell);
list_del(&m->list);
- mempool_free(m, m->tc->pool->mapping_pool);
+ mempool_free(m, &m->tc->pool->mapping_pool);
}
static void process_prepared_mapping(struct dm_thin_new_mapping *m)
@@ -961,7 +962,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
out:
list_del(&m->list);
- mempool_free(m, pool->mapping_pool);
+ mempool_free(m, &pool->mapping_pool);
}
/*----------------------------------------------------------------*/
@@ -971,7 +972,7 @@ static void free_discard_mapping(struct dm_thin_new_mapping *m)
struct thin_c *tc = m->tc;
if (m->cell)
cell_defer_no_holder(tc, m->cell);
- mempool_free(m, tc->pool->mapping_pool);
+ mempool_free(m, &tc->pool->mapping_pool);
}
static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
@@ -999,7 +1000,7 @@ static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
bio_endio(m->bio);
cell_defer_no_holder(tc, m->cell);
- mempool_free(m, tc->pool->mapping_pool);
+ mempool_free(m, &tc->pool->mapping_pool);
}
/*----------------------------------------------------------------*/
@@ -1092,7 +1093,7 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
metadata_operation_failed(pool, "dm_thin_remove_range", r);
bio_io_error(m->bio);
cell_defer_no_holder(tc, m->cell);
- mempool_free(m, pool->mapping_pool);
+ mempool_free(m, &pool->mapping_pool);
return;
}
@@ -1105,7 +1106,7 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
bio_io_error(m->bio);
cell_defer_no_holder(tc, m->cell);
- mempool_free(m, pool->mapping_pool);
+ mempool_free(m, &pool->mapping_pool);
return;
}
@@ -1150,7 +1151,7 @@ static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
bio_endio(m->bio);
cell_defer_no_holder(tc, m->cell);
- mempool_free(m, pool->mapping_pool);
+ mempool_free(m, &pool->mapping_pool);
}
static void process_prepared(struct pool *pool, struct list_head *head,
@@ -1196,7 +1197,7 @@ static int ensure_next_mapping(struct pool *pool)
if (pool->next_mapping)
return 0;
- pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
+ pool->next_mapping = mempool_alloc(&pool->mapping_pool, GFP_ATOMIC);
return pool->next_mapping ? 0 : -ENOMEM;
}
@@ -1385,6 +1386,8 @@ static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
+static void requeue_bios(struct pool *pool);
+
static void check_for_space(struct pool *pool)
{
int r;
@@ -1397,8 +1400,10 @@ static void check_for_space(struct pool *pool)
if (r)
return;
- if (nr_free)
+ if (nr_free) {
set_pool_mode(pool, PM_WRITE);
+ requeue_bios(pool);
+ }
}
/*
@@ -1475,7 +1480,10 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
r = dm_pool_alloc_data_block(pool->pmd, result);
if (r) {
- metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
+ if (r == -ENOSPC)
+ set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
+ else
+ metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
return r;
}
@@ -2835,8 +2843,8 @@ static void __pool_destroy(struct pool *pool)
destroy_workqueue(pool->wq);
if (pool->next_mapping)
- mempool_free(pool->next_mapping, pool->mapping_pool);
- mempool_destroy(pool->mapping_pool);
+ mempool_free(pool->next_mapping, &pool->mapping_pool);
+ mempool_exit(&pool->mapping_pool);
dm_deferred_set_destroy(pool->shared_read_ds);
dm_deferred_set_destroy(pool->all_io_ds);
kfree(pool);
@@ -2861,7 +2869,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
return (struct pool *)pmd;
}
- pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool) {
*error = "Error allocating memory for pool";
err_p = ERR_PTR(-ENOMEM);
@@ -2931,15 +2939,17 @@ static struct pool *pool_create(struct mapped_device *pool_md,
}
pool->next_mapping = NULL;
- pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
- _new_mapping_cache);
- if (!pool->mapping_pool) {
+ r = mempool_init_slab_pool(&pool->mapping_pool, MAPPING_POOL_SIZE,
+ _new_mapping_cache);
+ if (r) {
*error = "Error creating pool's mapping mempool";
- err_p = ERR_PTR(-ENOMEM);
+ err_p = ERR_PTR(r);
goto bad_mapping_pool;
}
- pool->cell_sort_array = vmalloc(sizeof(*pool->cell_sort_array) * CELL_SORT_ARRAY_SIZE);
+ pool->cell_sort_array =
+ vmalloc(array_size(CELL_SORT_ARRAY_SIZE,
+ sizeof(*pool->cell_sort_array)));
if (!pool->cell_sort_array) {
*error = "Error allocating cell sort array";
err_p = ERR_PTR(-ENOMEM);
@@ -2955,7 +2965,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
return pool;
bad_sort_array:
- mempool_destroy(pool->mapping_pool);
+ mempool_exit(&pool->mapping_pool);
bad_mapping_pool:
dm_deferred_set_destroy(pool->all_io_ds);
bad_all_io_ds:
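Two behavioural fixes ride along in dm-thin. Running out of data space is now recognised directly in alloc_data_block(): -ENOSPC flips the pool into PM_OUT_OF_DATA_SPACE instead of being treated as a metadata failure. And once free space reappears, check_for_space() requeues the bios that were parked during the outage so they do not stall waiting for an unrelated event:

	if (nr_free) {
		set_pool_mode(pool, PM_WRITE);
		requeue_bios(pool);	/* wake bios queued while out of space */
	}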
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index e13f90832b6b..684af08d0747 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -309,13 +309,13 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
unsigned n;
if (!fio->rs)
- fio->rs = mempool_alloc(v->fec->rs_pool, GFP_NOIO);
+ fio->rs = mempool_alloc(&v->fec->rs_pool, GFP_NOIO);
fec_for_each_prealloc_buffer(n) {
if (fio->bufs[n])
continue;
- fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOWAIT);
+ fio->bufs[n] = mempool_alloc(&v->fec->prealloc_pool, GFP_NOWAIT);
if (unlikely(!fio->bufs[n])) {
DMERR("failed to allocate FEC buffer");
return -ENOMEM;
@@ -327,7 +327,7 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
if (fio->bufs[n])
continue;
- fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOWAIT);
+ fio->bufs[n] = mempool_alloc(&v->fec->extra_pool, GFP_NOWAIT);
/* we can manage with even one buffer if necessary */
if (unlikely(!fio->bufs[n]))
break;
@@ -335,7 +335,7 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
fio->nbufs = n;
if (!fio->output)
- fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO);
+ fio->output = mempool_alloc(&v->fec->output_pool, GFP_NOIO);
return 0;
}
@@ -493,15 +493,15 @@ void verity_fec_finish_io(struct dm_verity_io *io)
if (!verity_fec_is_enabled(io->v))
return;
- mempool_free(fio->rs, f->rs_pool);
+ mempool_free(fio->rs, &f->rs_pool);
fec_for_each_prealloc_buffer(n)
- mempool_free(fio->bufs[n], f->prealloc_pool);
+ mempool_free(fio->bufs[n], &f->prealloc_pool);
fec_for_each_extra_buffer(fio, n)
- mempool_free(fio->bufs[n], f->extra_pool);
+ mempool_free(fio->bufs[n], &f->extra_pool);
- mempool_free(fio->output, f->output_pool);
+ mempool_free(fio->output, &f->output_pool);
}
/*
@@ -549,9 +549,9 @@ void verity_fec_dtr(struct dm_verity *v)
if (!verity_fec_is_enabled(v))
goto out;
- mempool_destroy(f->rs_pool);
- mempool_destroy(f->prealloc_pool);
- mempool_destroy(f->extra_pool);
+ mempool_exit(&f->rs_pool);
+ mempool_exit(&f->prealloc_pool);
+ mempool_exit(&f->extra_pool);
kmem_cache_destroy(f->cache);
if (f->data_bufio)
@@ -570,7 +570,7 @@ static void *fec_rs_alloc(gfp_t gfp_mask, void *pool_data)
{
struct dm_verity *v = (struct dm_verity *)pool_data;
- return init_rs(8, 0x11d, 0, 1, v->fec->roots);
+ return init_rs_gfp(8, 0x11d, 0, 1, v->fec->roots, gfp_mask);
}
static void fec_rs_free(void *element, void *pool_data)
@@ -675,6 +675,7 @@ int verity_fec_ctr(struct dm_verity *v)
struct dm_verity_fec *f = v->fec;
struct dm_target *ti = v->ti;
u64 hash_blocks;
+ int ret;
if (!verity_fec_is_enabled(v)) {
verity_fec_dtr(v);
@@ -770,11 +771,11 @@ int verity_fec_ctr(struct dm_verity *v)
}
/* Preallocate an rs_control structure for each worker thread */
- f->rs_pool = mempool_create(num_online_cpus(), fec_rs_alloc,
- fec_rs_free, (void *) v);
- if (!f->rs_pool) {
+ ret = mempool_init(&f->rs_pool, num_online_cpus(), fec_rs_alloc,
+ fec_rs_free, (void *) v);
+ if (ret) {
ti->error = "Cannot allocate RS pool";
- return -ENOMEM;
+ return ret;
}
f->cache = kmem_cache_create("dm_verity_fec_buffers",
@@ -786,26 +787,26 @@ int verity_fec_ctr(struct dm_verity *v)
}
/* Preallocate DM_VERITY_FEC_BUF_PREALLOC buffers for each thread */
- f->prealloc_pool = mempool_create_slab_pool(num_online_cpus() *
- DM_VERITY_FEC_BUF_PREALLOC,
- f->cache);
- if (!f->prealloc_pool) {
+ ret = mempool_init_slab_pool(&f->prealloc_pool, num_online_cpus() *
+ DM_VERITY_FEC_BUF_PREALLOC,
+ f->cache);
+ if (ret) {
ti->error = "Cannot allocate FEC buffer prealloc pool";
- return -ENOMEM;
+ return ret;
}
- f->extra_pool = mempool_create_slab_pool(0, f->cache);
- if (!f->extra_pool) {
+ ret = mempool_init_slab_pool(&f->extra_pool, 0, f->cache);
+ if (ret) {
ti->error = "Cannot allocate FEC buffer extra pool";
- return -ENOMEM;
+ return ret;
}
/* Preallocate an output buffer for each thread */
- f->output_pool = mempool_create_kmalloc_pool(num_online_cpus(),
- 1 << v->data_dev_block_bits);
- if (!f->output_pool) {
+ ret = mempool_init_kmalloc_pool(&f->output_pool, num_online_cpus(),
+ 1 << v->data_dev_block_bits);
+ if (ret) {
ti->error = "Cannot allocate FEC output pool";
- return -ENOMEM;
+ return ret;
}
/* Reserve space for our per-bio data */
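The Reed-Solomon pool above shows the general-purpose mempool constructor, mempool_init() with alloc/free callbacks. An element constructor must honour the gfp mask it is handed, which is why init_rs(), which allocates with GFP_KERNEL internally, is replaced by init_rs_gfp():

	static void *fec_rs_alloc(gfp_t gfp_mask, void *pool_data)
	{
		struct dm_verity *v = pool_data;

		/* GFP_NOIO and friends must propagate into the RS allocation */
		return init_rs_gfp(8, 0x11d, 0, 1, v->fec->roots, gfp_mask);
	}

	ret = mempool_init(&f->rs_pool, num_online_cpus(), fec_rs_alloc,
			   fec_rs_free, (void *)v);

A constructor that ignores gfp_mask can sleep or recurse into the I/O path from a GFP_NOIO context, which is exactly what mempools exist to prevent.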
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index bb31ce87a933..6ad803b2b36c 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h
@@ -46,10 +46,10 @@ struct dm_verity_fec {
sector_t hash_blocks; /* blocks covered after v->hash_start */
unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */
unsigned char rsn; /* N of RS(M, N) */
- mempool_t *rs_pool; /* mempool for fio->rs */
- mempool_t *prealloc_pool; /* mempool for preallocated buffers */
- mempool_t *extra_pool; /* mempool for extra buffers */
- mempool_t *output_pool; /* mempool for output */
+ mempool_t rs_pool; /* mempool for fio->rs */
+ mempool_t prealloc_pool; /* mempool for preallocated buffers */
+ mempool_t extra_pool; /* mempool for extra buffers */
+ mempool_t output_pool; /* mempool for output */
struct kmem_cache *cache; /* cache for buffers */
};
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index fc893f636a98..12decdbd722d 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -797,8 +797,9 @@ static int verity_alloc_most_once(struct dm_verity *v)
return -E2BIG;
}
- v->validated_blocks = kvzalloc(BITS_TO_LONGS(v->data_blocks) *
- sizeof(unsigned long), GFP_KERNEL);
+ v->validated_blocks = kvcalloc(BITS_TO_LONGS(v->data_blocks),
+ sizeof(unsigned long),
+ GFP_KERNEL);
if (!v->validated_blocks) {
ti->error = "failed to allocate bitset for check_at_most_once";
return -ENOMEM;
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
new file mode 100644
index 000000000000..07ea6a48aac6
--- /dev/null
+++ b/drivers/md/dm-writecache.c
@@ -0,0 +1,2305 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/kthread.h>
+#include <linux/dm-io.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/dax.h>
+#include <linux/pfn_t.h>
+#include <linux/libnvdimm.h>
+
+#define DM_MSG_PREFIX "writecache"
+
+#define HIGH_WATERMARK 50
+#define LOW_WATERMARK 45
+#define MAX_WRITEBACK_JOBS 0
+#define ENDIO_LATENCY 16
+#define WRITEBACK_LATENCY 64
+#define AUTOCOMMIT_BLOCKS_SSD 65536
+#define AUTOCOMMIT_BLOCKS_PMEM 64
+#define AUTOCOMMIT_MSEC 1000
+
+#define BITMAP_GRANULARITY 65536
+#if BITMAP_GRANULARITY < PAGE_SIZE
+#undef BITMAP_GRANULARITY
+#define BITMAP_GRANULARITY PAGE_SIZE
+#endif
+
+#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
+#define DM_WRITECACHE_HAS_PMEM
+#endif
+
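+/*
+ * pmem_assign() makes a single store durable on persistent memory: the
+ * temporary forces one evaluation of @src, and memcpy_flushcache() pushes
+ * the bytes past the CPU cache.  Without PMEM support it decays to a
+ * plain assignment.
+ */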
+#ifdef DM_WRITECACHE_HAS_PMEM
+#define pmem_assign(dest, src) \
+do { \
+ typeof(dest) uniq = (src); \
+ memcpy_flushcache(&(dest), &uniq, sizeof(dest)); \
+} while (0)
+#else
+#define pmem_assign(dest, src) ((dest) = (src))
+#endif
+
+#if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
+#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+#endif
+
+#define MEMORY_SUPERBLOCK_MAGIC 0x23489321
+#define MEMORY_SUPERBLOCK_VERSION 1
+
+struct wc_memory_entry {
+ __le64 original_sector;
+ __le64 seq_count;
+};
+
+struct wc_memory_superblock {
+ union {
+ struct {
+ __le32 magic;
+ __le32 version;
+ __le32 block_size;
+ __le32 pad;
+ __le64 n_blocks;
+ __le64 seq_count;
+ };
+ __le64 padding[8];
+ };
+ struct wc_memory_entry entries[0];
+};
+
+struct wc_entry {
+ struct rb_node rb_node;
+ struct list_head lru;
+ unsigned short wc_list_contiguous;
+ bool write_in_progress
+#if BITS_PER_LONG == 64
+ :1
+#endif
+ ;
+ unsigned long index
+#if BITS_PER_LONG == 64
+ :47
+#endif
+ ;
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+ uint64_t original_sector;
+ uint64_t seq_count;
+#endif
+};
+
+#ifdef DM_WRITECACHE_HAS_PMEM
+#define WC_MODE_PMEM(wc) ((wc)->pmem_mode)
+#define WC_MODE_FUA(wc) ((wc)->writeback_fua)
+#else
+#define WC_MODE_PMEM(wc) false
+#define WC_MODE_FUA(wc) false
+#endif
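+/*
+ * In SSD mode free blocks are kept in an rb-tree and handed out in address
+ * order, keeping I/O to the cache device largely sequential; PMEM has no
+ * seek cost, so a plain list suffices there.
+ */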
+#define WC_MODE_SORT_FREELIST(wc) (!WC_MODE_PMEM(wc))
+
+struct dm_writecache {
+ struct mutex lock;
+ struct list_head lru;
+ union {
+ struct list_head freelist;
+ struct {
+ struct rb_root freetree;
+ struct wc_entry *current_free;
+ };
+ };
+ struct rb_root tree;
+
+ size_t freelist_size;
+ size_t writeback_size;
+ size_t freelist_high_watermark;
+ size_t freelist_low_watermark;
+
+ unsigned uncommitted_blocks;
+ unsigned autocommit_blocks;
+ unsigned max_writeback_jobs;
+
+ int error;
+
+ unsigned long autocommit_jiffies;
+ struct timer_list autocommit_timer;
+ struct wait_queue_head freelist_wait;
+
+ atomic_t bio_in_progress[2];
+ struct wait_queue_head bio_in_progress_wait[2];
+
+ struct dm_target *ti;
+ struct dm_dev *dev;
+ struct dm_dev *ssd_dev;
+ void *memory_map;
+ uint64_t memory_map_size;
+ size_t metadata_sectors;
+ size_t n_blocks;
+ uint64_t seq_count;
+ void *block_start;
+ struct wc_entry *entries;
+ unsigned block_size;
+ unsigned char block_size_bits;
+
+ bool pmem_mode:1;
+ bool writeback_fua:1;
+
+ bool overwrote_committed:1;
+ bool memory_vmapped:1;
+
+ bool high_wm_percent_set:1;
+ bool low_wm_percent_set:1;
+ bool max_writeback_jobs_set:1;
+ bool autocommit_blocks_set:1;
+ bool autocommit_time_set:1;
+ bool writeback_fua_set:1;
+ bool flush_on_suspend:1;
+
+ unsigned writeback_all;
+ struct workqueue_struct *writeback_wq;
+ struct work_struct writeback_work;
+ struct work_struct flush_work;
+
+ struct dm_io_client *dm_io;
+
+ raw_spinlock_t endio_list_lock;
+ struct list_head endio_list;
+ struct task_struct *endio_thread;
+
+ struct task_struct *flush_thread;
+ struct bio_list flush_list;
+
+ struct dm_kcopyd_client *dm_kcopyd;
+ unsigned long *dirty_bitmap;
+ unsigned dirty_bitmap_size;
+
+ struct bio_set bio_set;
+ mempool_t copy_pool;
+};
+
+#define WB_LIST_INLINE 16
+
+struct writeback_struct {
+ struct list_head endio_entry;
+ struct dm_writecache *wc;
+ struct wc_entry **wc_list;
+ unsigned wc_list_n;
+ unsigned page_offset;
+ struct page *page;
+ struct wc_entry *wc_list_inline[WB_LIST_INLINE];
+ struct bio bio;
+};
+
+struct copy_struct {
+ struct list_head endio_entry;
+ struct dm_writecache *wc;
+ struct wc_entry *e;
+ unsigned n_entries;
+ int error;
+};
+
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
+ "A percentage of time allocated for data copying");
+
+static void wc_lock(struct dm_writecache *wc)
+{
+ mutex_lock(&wc->lock);
+}
+
+static void wc_unlock(struct dm_writecache *wc)
+{
+ mutex_unlock(&wc->lock);
+}
+
+#ifdef DM_WRITECACHE_HAS_PMEM
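+/*
+ * Map the whole cache device through DAX.  If dax_direct_access() cannot
+ * return the full range as one linear mapping, gather the backing pages
+ * piecewise and vmap() them into a single virtually contiguous region.
+ */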
+static int persistent_memory_claim(struct dm_writecache *wc)
+{
+ int r;
+ loff_t s;
+ long p, da;
+ pfn_t pfn;
+ int id;
+ struct page **pages;
+
+ wc->memory_vmapped = false;
+
+ if (!wc->ssd_dev->dax_dev) {
+ r = -EOPNOTSUPP;
+ goto err1;
+ }
+ s = wc->memory_map_size;
+ p = s >> PAGE_SHIFT;
+ if (!p) {
+ r = -EINVAL;
+ goto err1;
+ }
+ if (p != s >> PAGE_SHIFT) {
+ r = -EOVERFLOW;
+ goto err1;
+ }
+
+ id = dax_read_lock();
+
+ da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
+ if (da < 0) {
+ wc->memory_map = NULL;
+ r = da;
+ goto err2;
+ }
+ if (!pfn_t_has_page(pfn)) {
+ wc->memory_map = NULL;
+ r = -EOPNOTSUPP;
+ goto err2;
+ }
+ if (da != p) {
+ long i;
+ wc->memory_map = NULL;
+ pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
+ if (!pages) {
+ r = -ENOMEM;
+ goto err2;
+ }
+ i = 0;
+ do {
+ long daa;
+ void *dummy_addr;
+ daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
+ &dummy_addr, &pfn);
+ if (daa <= 0) {
+ r = daa ? daa : -EINVAL;
+ goto err3;
+ }
+ if (!pfn_t_has_page(pfn)) {
+ r = -EOPNOTSUPP;
+ goto err3;
+ }
+ while (daa-- && i < p) {
+ pages[i++] = pfn_t_to_page(pfn);
+ pfn.val++;
+ }
+ } while (i < p);
+ wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
+ if (!wc->memory_map) {
+ r = -ENOMEM;
+ goto err3;
+ }
+ kvfree(pages);
+ wc->memory_vmapped = true;
+ }
+
+ dax_read_unlock(id);
+ return 0;
+err3:
+ kvfree(pages);
+err2:
+ dax_read_unlock(id);
+err1:
+ return r;
+}
+#else
+static int persistent_memory_claim(struct dm_writecache *wc)
+{
+ BUG();
+}
+#endif
+
+static void persistent_memory_release(struct dm_writecache *wc)
+{
+ if (wc->memory_vmapped)
+ vunmap(wc->memory_map);
+}
+
+static struct page *persistent_memory_page(void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ return vmalloc_to_page(addr);
+ else
+ return virt_to_page(addr);
+}
+
+static unsigned persistent_memory_page_offset(void *addr)
+{
+ return (unsigned long)addr & (PAGE_SIZE - 1);
+}
+
+static void persistent_memory_flush_cache(void *ptr, size_t size)
+{
+ if (is_vmalloc_addr(ptr))
+ flush_kernel_vmap_range(ptr, size);
+}
+
+static void persistent_memory_invalidate_cache(void *ptr, size_t size)
+{
+ if (is_vmalloc_addr(ptr))
+ invalidate_kernel_vmap_range(ptr, size);
+}
+
+static struct wc_memory_superblock *sb(struct dm_writecache *wc)
+{
+ return wc->memory_map;
+}
+
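+/* The "&& 0" deliberately disables the pointer-arithmetic fast path;
+ * both branches address the same on-media slot. */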
+static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
+{
+ if (is_power_of_2(sizeof(struct wc_entry)) && 0)
+ return &sb(wc)->entries[e - wc->entries];
+ else
+ return &sb(wc)->entries[e->index];
+}
+
+static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
+{
+ return (char *)wc->block_start + (e->index << wc->block_size_bits);
+}
+
+static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
+{
+ return wc->metadata_sectors +
+ ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
+}
+
+static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
+{
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+ return e->original_sector;
+#else
+ return le64_to_cpu(memory_entry(wc, e)->original_sector);
+#endif
+}
+
+static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
+{
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+ return e->seq_count;
+#else
+ return le64_to_cpu(memory_entry(wc, e)->seq_count);
+#endif
+}
+
+static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
+{
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+ e->seq_count = -1;
+#endif
+ pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
+}
+
+static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
+ uint64_t original_sector, uint64_t seq_count)
+{
+ struct wc_memory_entry me;
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+ e->original_sector = original_sector;
+ e->seq_count = seq_count;
+#endif
+ me.original_sector = cpu_to_le64(original_sector);
+ me.seq_count = cpu_to_le64(seq_count);
+ pmem_assign(*memory_entry(wc, e), me);
+}
+
+#define writecache_error(wc, err, msg, arg...) \
+do { \
+ if (!cmpxchg(&(wc)->error, 0, err)) \
+ DMERR(msg, ##arg); \
+ wake_up(&(wc)->freelist_wait); \
+} while (0)
+
+#define writecache_has_error(wc) (unlikely(READ_ONCE((wc)->error)))
+
+static void writecache_flush_all_metadata(struct dm_writecache *wc)
+{
+ if (!WC_MODE_PMEM(wc))
+ memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
+}
+
+static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
+{
+ if (!WC_MODE_PMEM(wc))
+ __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
+ wc->dirty_bitmap);
+}
+
+static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
+
+struct io_notify {
+ struct dm_writecache *wc;
+ struct completion c;
+ atomic_t count;
+};
+
+static void writecache_notify_io(unsigned long error, void *context)
+{
+ struct io_notify *endio = context;
+
+ if (unlikely(error != 0))
+ writecache_error(endio->wc, -EIO, "error writing metadata");
+ BUG_ON(atomic_read(&endio->count) <= 0);
+ if (atomic_dec_and_test(&endio->count))
+ complete(&endio->c);
+}
+
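+/*
+ * SSD mode: write out every metadata region flagged in the dirty bitmap.
+ * Each run of set bits becomes one asynchronous dm-io write; the writes
+ * are waited for together and followed by a full disk flush.
+ */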
+static void ssd_commit_flushed(struct dm_writecache *wc)
+{
+ struct dm_io_region region;
+ struct dm_io_request req;
+ struct io_notify endio = {
+ wc,
+ COMPLETION_INITIALIZER_ONSTACK(endio.c),
+ ATOMIC_INIT(1),
+ };
+ unsigned bitmap_bits = wc->dirty_bitmap_size * BITS_PER_LONG;
+ unsigned i = 0;
+
+ while (1) {
+ unsigned j;
+ i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
+ if (unlikely(i == bitmap_bits))
+ break;
+ j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
+
+ region.bdev = wc->ssd_dev->bdev;
+ region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
+ region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
+
+ if (unlikely(region.sector >= wc->metadata_sectors))
+ break;
+ if (unlikely(region.sector + region.count > wc->metadata_sectors))
+ region.count = wc->metadata_sectors - region.sector;
+
+ atomic_inc(&endio.count);
+ req.bi_op = REQ_OP_WRITE;
+ req.bi_op_flags = REQ_SYNC;
+ req.mem.type = DM_IO_VMA;
+ req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
+ req.client = wc->dm_io;
+ req.notify.fn = writecache_notify_io;
+ req.notify.context = &endio;
+
+ /* writing via async dm-io (implied by notify.fn above) won't return an error */
+ (void) dm_io(&req, 1, &region, NULL);
+ i = j;
+ }
+
+ writecache_notify_io(0, &endio);
+ wait_for_completion_io(&endio.c);
+
+ writecache_disk_flush(wc, wc->ssd_dev);
+
+ memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
+}
+
+static void writecache_commit_flushed(struct dm_writecache *wc)
+{
+ if (WC_MODE_PMEM(wc))
+ wmb();
+ else
+ ssd_commit_flushed(wc);
+}
+
+static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
+{
+ int r;
+ struct dm_io_region region;
+ struct dm_io_request req;
+
+ region.bdev = dev->bdev;
+ region.sector = 0;
+ region.count = 0;
+ req.bi_op = REQ_OP_WRITE;
+ req.bi_op_flags = REQ_PREFLUSH;
+ req.mem.type = DM_IO_KMEM;
+ req.mem.ptr.addr = NULL;
+ req.client = wc->dm_io;
+ req.notify.fn = NULL;
+
+ r = dm_io(&req, 1, &region, NULL);
+ if (unlikely(r))
+ writecache_error(wc, r, "error flushing metadata: %d", r);
+}
+
+static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
+{
+ wait_event(wc->bio_in_progress_wait[direction],
+ !atomic_read(&wc->bio_in_progress[direction]));
+}
+
+#define WFE_RETURN_FOLLOWING 1
+#define WFE_LOWEST_SEQ 2
+
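+/*
+ * Find the cache entry for @block.  Several generations of the same
+ * original sector may coexist in the tree: WFE_LOWEST_SEQ picks the
+ * oldest, otherwise the newest is returned.  WFE_RETURN_FOLLOWING falls
+ * forward to the next higher sector when there is no exact match, which
+ * lets discard walk a range.
+ */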
+static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
+ uint64_t block, int flags)
+{
+ struct wc_entry *e;
+ struct rb_node *node = wc->tree.rb_node;
+
+ if (unlikely(!node))
+ return NULL;
+
+ while (1) {
+ e = container_of(node, struct wc_entry, rb_node);
+ if (read_original_sector(wc, e) == block)
+ break;
+ node = (read_original_sector(wc, e) >= block ?
+ e->rb_node.rb_left : e->rb_node.rb_right);
+ if (unlikely(!node)) {
+ if (!(flags & WFE_RETURN_FOLLOWING)) {
+ return NULL;
+ }
+ if (read_original_sector(wc, e) >= block) {
+ break;
+ } else {
+ node = rb_next(&e->rb_node);
+ if (unlikely(!node)) {
+ return NULL;
+ }
+ e = container_of(node, struct wc_entry, rb_node);
+ break;
+ }
+ }
+ }
+
+ while (1) {
+ struct wc_entry *e2;
+ if (flags & WFE_LOWEST_SEQ)
+ node = rb_prev(&e->rb_node);
+ else
+ node = rb_next(&e->rb_node);
+ if (!node)
+ return e;
+ e2 = container_of(node, struct wc_entry, rb_node);
+ if (read_original_sector(wc, e2) != block)
+ return e;
+ e = e2;
+ }
+}
+
+static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
+{
+ struct wc_entry *e;
+ struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
+
+ while (*node) {
+ e = container_of(*node, struct wc_entry, rb_node);
+ parent = &e->rb_node;
+ if (read_original_sector(wc, e) > read_original_sector(wc, ins))
+ node = &parent->rb_left;
+ else
+ node = &parent->rb_right;
+ }
+ rb_link_node(&ins->rb_node, parent, node);
+ rb_insert_color(&ins->rb_node, &wc->tree);
+ list_add(&ins->lru, &wc->lru);
+}
+
+static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
+{
+ list_del(&e->lru);
+ rb_erase(&e->rb_node, &wc->tree);
+}
+
+static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
+{
+ if (WC_MODE_SORT_FREELIST(wc)) {
+ struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
+ if (unlikely(!*node))
+ wc->current_free = e;
+ while (*node) {
+ parent = *node;
+ if (&e->rb_node < *node)
+ node = &parent->rb_left;
+ else
+ node = &parent->rb_right;
+ }
+ rb_link_node(&e->rb_node, parent, node);
+ rb_insert_color(&e->rb_node, &wc->freetree);
+ } else {
+ list_add_tail(&e->lru, &wc->freelist);
+ }
+ wc->freelist_size++;
+}
+
+static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
+{
+ struct wc_entry *e;
+
+ if (WC_MODE_SORT_FREELIST(wc)) {
+ struct rb_node *next;
+ if (unlikely(!wc->current_free))
+ return NULL;
+ e = wc->current_free;
+ next = rb_next(&e->rb_node);
+ rb_erase(&e->rb_node, &wc->freetree);
+ if (unlikely(!next))
+ next = rb_first(&wc->freetree);
+ wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
+ } else {
+ if (unlikely(list_empty(&wc->freelist)))
+ return NULL;
+ e = container_of(wc->freelist.next, struct wc_entry, lru);
+ list_del(&e->lru);
+ }
+ wc->freelist_size--;
+ if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
+ queue_work(wc->writeback_wq, &wc->writeback_work);
+
+ return e;
+}
+
+static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
+{
+ writecache_unlink(wc, e);
+ writecache_add_to_freelist(wc, e);
+ clear_seq_count(wc, e);
+ writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
+ if (unlikely(waitqueue_active(&wc->freelist_wait)))
+ wake_up(&wc->freelist_wait);
+}
+
+static void writecache_wait_on_freelist(struct dm_writecache *wc)
+{
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
+ wc_unlock(wc);
+ io_schedule();
+ finish_wait(&wc->freelist_wait, &wait);
+ wc_lock(wc);
+}
+
+static void writecache_poison_lists(struct dm_writecache *wc)
+{
+ /*
+ * Catch incorrect access to these values while the device is suspended.
+ */
+ memset(&wc->tree, -1, sizeof wc->tree);
+ wc->lru.next = LIST_POISON1;
+ wc->lru.prev = LIST_POISON2;
+ wc->freelist.next = LIST_POISON1;
+ wc->freelist.prev = LIST_POISON2;
+}
+
+static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
+{
+ writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
+ if (WC_MODE_PMEM(wc))
+ writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
+}
+
+static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
+{
+ return read_seq_count(wc, e) < wc->seq_count;
+}
+
+static void writecache_flush(struct dm_writecache *wc)
+{
+ struct wc_entry *e, *e2;
+ bool need_flush_after_free;
+
+ wc->uncommitted_blocks = 0;
+ del_timer(&wc->autocommit_timer);
+
+ if (list_empty(&wc->lru))
+ return;
+
+ e = container_of(wc->lru.next, struct wc_entry, lru);
+ if (writecache_entry_is_committed(wc, e)) {
+ if (wc->overwrote_committed) {
+ writecache_wait_for_ios(wc, WRITE);
+ writecache_disk_flush(wc, wc->ssd_dev);
+ wc->overwrote_committed = false;
+ }
+ return;
+ }
+ while (1) {
+ writecache_flush_entry(wc, e);
+ if (unlikely(e->lru.next == &wc->lru))
+ break;
+ e2 = container_of(e->lru.next, struct wc_entry, lru);
+ if (writecache_entry_is_committed(wc, e2))
+ break;
+ e = e2;
+ cond_resched();
+ }
+ writecache_commit_flushed(wc);
+
+ writecache_wait_for_ios(wc, WRITE);
+
+ wc->seq_count++;
+ pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
+ writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
+ writecache_commit_flushed(wc);
+
+ wc->overwrote_committed = false;
+
+ need_flush_after_free = false;
+ while (1) {
+ /* Free another committed entry with lower seq-count */
+ struct rb_node *rb_node = rb_prev(&e->rb_node);
+
+ if (rb_node) {
+ e2 = container_of(rb_node, struct wc_entry, rb_node);
+ if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
+ likely(!e2->write_in_progress)) {
+ writecache_free_entry(wc, e2);
+ need_flush_after_free = true;
+ }
+ }
+ if (unlikely(e->lru.prev == &wc->lru))
+ break;
+ e = container_of(e->lru.prev, struct wc_entry, lru);
+ cond_resched();
+ }
+
+ if (need_flush_after_free)
+ writecache_commit_flushed(wc);
+}
+
+static void writecache_flush_work(struct work_struct *work)
+{
+ struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
+
+ wc_lock(wc);
+ writecache_flush(wc);
+ wc_unlock(wc);
+}
+
+static void writecache_autocommit_timer(struct timer_list *t)
+{
+ struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
+ if (!writecache_has_error(wc))
+ queue_work(wc->writeback_wq, &wc->flush_work);
+}
+
+static void writecache_schedule_autocommit(struct dm_writecache *wc)
+{
+ if (!timer_pending(&wc->autocommit_timer))
+ mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
+}
+
+static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
+{
+ struct wc_entry *e;
+ bool discarded_something = false;
+
+ e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
+ if (unlikely(!e))
+ return;
+
+ while (read_original_sector(wc, e) < end) {
+ struct rb_node *node = rb_next(&e->rb_node);
+
+ if (likely(!e->write_in_progress)) {
+ if (!discarded_something) {
+ writecache_wait_for_ios(wc, READ);
+ writecache_wait_for_ios(wc, WRITE);
+ discarded_something = true;
+ }
+ writecache_free_entry(wc, e);
+ }
+
+ if (!node)
+ break;
+
+ e = container_of(node, struct wc_entry, rb_node);
+ }
+
+ if (discarded_something)
+ writecache_commit_flushed(wc);
+}
+
+static bool writecache_wait_for_writeback(struct dm_writecache *wc)
+{
+ if (wc->writeback_size) {
+ writecache_wait_on_freelist(wc);
+ return true;
+ }
+ return false;
+}
+
+static void writecache_suspend(struct dm_target *ti)
+{
+ struct dm_writecache *wc = ti->private;
+ bool flush_on_suspend;
+
+ del_timer_sync(&wc->autocommit_timer);
+
+ wc_lock(wc);
+ writecache_flush(wc);
+ flush_on_suspend = wc->flush_on_suspend;
+ if (flush_on_suspend) {
+ wc->flush_on_suspend = false;
+ wc->writeback_all++;
+ queue_work(wc->writeback_wq, &wc->writeback_work);
+ }
+ wc_unlock(wc);
+
+ flush_workqueue(wc->writeback_wq);
+
+ wc_lock(wc);
+ if (flush_on_suspend)
+ wc->writeback_all--;
+ while (writecache_wait_for_writeback(wc));
+
+ if (WC_MODE_PMEM(wc))
+ persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
+
+ writecache_poison_lists(wc);
+
+ wc_unlock(wc);
+}
+
+static int writecache_alloc_entries(struct dm_writecache *wc)
+{
+ size_t b;
+
+ if (wc->entries)
+ return 0;
+ wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
+ if (!wc->entries)
+ return -ENOMEM;
+ for (b = 0; b < wc->n_blocks; b++) {
+ struct wc_entry *e = &wc->entries[b];
+ e->index = b;
+ e->write_in_progress = false;
+ }
+
+ return 0;
+}
+
+static void writecache_resume(struct dm_target *ti)
+{
+ struct dm_writecache *wc = ti->private;
+ size_t b;
+ bool need_flush = false;
+ __le64 sb_seq_count;
+ int r;
+
+ wc_lock(wc);
+
+ if (WC_MODE_PMEM(wc))
+ persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
+
+ wc->tree = RB_ROOT;
+ INIT_LIST_HEAD(&wc->lru);
+ if (WC_MODE_SORT_FREELIST(wc)) {
+ wc->freetree = RB_ROOT;
+ wc->current_free = NULL;
+ } else {
+ INIT_LIST_HEAD(&wc->freelist);
+ }
+ wc->freelist_size = 0;
+
+ r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
+ if (r) {
+ writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
+ sb_seq_count = cpu_to_le64(0);
+ }
+ wc->seq_count = le64_to_cpu(sb_seq_count);
+
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+ for (b = 0; b < wc->n_blocks; b++) {
+ struct wc_entry *e = &wc->entries[b];
+ struct wc_memory_entry wme;
+ if (writecache_has_error(wc)) {
+ e->original_sector = -1;
+ e->seq_count = -1;
+ continue;
+ }
+ r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
+ if (r) {
+ writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
+ (unsigned long)b, r);
+ e->original_sector = -1;
+ e->seq_count = -1;
+ } else {
+ e->original_sector = le64_to_cpu(wme.original_sector);
+ e->seq_count = le64_to_cpu(wme.seq_count);
+ }
+ }
+#endif
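+ /*
+ * Rebuild the in-core state: uncommitted entries go back to the
+ * freelist; when two committed entries map the same sector (possible
+ * after a crash), only the copy with the higher sequence count is kept.
+ */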
+ for (b = 0; b < wc->n_blocks; b++) {
+ struct wc_entry *e = &wc->entries[b];
+ if (!writecache_entry_is_committed(wc, e)) {
+ if (read_seq_count(wc, e) != -1) {
+erase_this:
+ clear_seq_count(wc, e);
+ need_flush = true;
+ }
+ writecache_add_to_freelist(wc, e);
+ } else {
+ struct wc_entry *old;
+
+ old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
+ if (!old) {
+ writecache_insert_entry(wc, e);
+ } else {
+ if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
+ writecache_error(wc, -EINVAL,
+ "two identical entries, position %llu, sector %llu, sequence %llu",
+ (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
+ (unsigned long long)read_seq_count(wc, e));
+ }
+ if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
+ goto erase_this;
+ } else {
+ writecache_free_entry(wc, old);
+ writecache_insert_entry(wc, e);
+ need_flush = true;
+ }
+ }
+ }
+ cond_resched();
+ }
+
+ if (need_flush) {
+ writecache_flush_all_metadata(wc);
+ writecache_commit_flushed(wc);
+ }
+
+ wc_unlock(wc);
+}
+
+static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+{
+ if (argc != 1)
+ return -EINVAL;
+
+ wc_lock(wc);
+ if (dm_suspended(wc->ti)) {
+ wc_unlock(wc);
+ return -EBUSY;
+ }
+ if (writecache_has_error(wc)) {
+ wc_unlock(wc);
+ return -EIO;
+ }
+
+ writecache_flush(wc);
+ wc->writeback_all++;
+ queue_work(wc->writeback_wq, &wc->writeback_work);
+ wc_unlock(wc);
+
+ flush_workqueue(wc->writeback_wq);
+
+ wc_lock(wc);
+ wc->writeback_all--;
+ if (writecache_has_error(wc)) {
+ wc_unlock(wc);
+ return -EIO;
+ }
+ wc_unlock(wc);
+
+ return 0;
+}
+
+static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+{
+ if (argc != 1)
+ return -EINVAL;
+
+ wc_lock(wc);
+ wc->flush_on_suspend = true;
+ wc_unlock(wc);
+
+ return 0;
+}
+
+static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
+ char *result, unsigned maxlen)
+{
+ int r = -EINVAL;
+ struct dm_writecache *wc = ti->private;
+
+ if (!strcasecmp(argv[0], "flush"))
+ r = process_flush_mesg(argc, argv, wc);
+ else if (!strcasecmp(argv[0], "flush_on_suspend"))
+ r = process_flush_on_suspend_mesg(argc, argv, wc);
+ else
+ DMERR("unrecognised message received: %s", argv[0]);
+
+ return r;
+}
+
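+/*
+ * Copy one cache block between the bio's pages and persistent memory.
+ * Reads use memcpy_mcsafe() so hardware memory errors are reported instead
+ * of causing a machine check; writes use memcpy_flushcache() to make the
+ * data durable.
+ */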
+static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
+{
+ void *buf;
+ unsigned long flags;
+ unsigned size;
+ int rw = bio_data_dir(bio);
+ unsigned remaining_size = wc->block_size;
+
+ do {
+ struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
+ buf = bvec_kmap_irq(&bv, &flags);
+ size = bv.bv_len;
+ if (unlikely(size > remaining_size))
+ size = remaining_size;
+
+ if (rw == READ) {
+ int r;
+ r = memcpy_mcsafe(buf, data, size);
+ flush_dcache_page(bio_page(bio));
+ if (unlikely(r)) {
+ writecache_error(wc, r, "hardware memory error when reading data: %d", r);
+ bio->bi_status = BLK_STS_IOERR;
+ }
+ } else {
+ flush_dcache_page(bio_page(bio));
+ memcpy_flushcache(data, buf, size);
+ }
+
+ bvec_kunmap_irq(buf, &flags);
+
+ data = (char *)data + size;
+ remaining_size -= size;
+ bio_advance(bio, size);
+ } while (unlikely(remaining_size));
+}
+
+static int writecache_flush_thread(void *data)
+{
+ struct dm_writecache *wc = data;
+
+ while (1) {
+ struct bio *bio;
+
+ wc_lock(wc);
+ bio = bio_list_pop(&wc->flush_list);
+ if (!bio) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ wc_unlock(wc);
+
+ if (unlikely(kthread_should_stop())) {
+ set_current_state(TASK_RUNNING);
+ break;
+ }
+
+ schedule();
+ continue;
+ }
+
+ if (bio_op(bio) == REQ_OP_DISCARD) {
+ writecache_discard(wc, bio->bi_iter.bi_sector,
+ bio_end_sector(bio));
+ wc_unlock(wc);
+ bio_set_dev(bio, wc->dev->bdev);
+ generic_make_request(bio);
+ } else {
+ writecache_flush(wc);
+ wc_unlock(wc);
+ if (writecache_has_error(wc))
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ }
+ }
+
+ return 0;
+}
+
+static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
+{
+ if (bio_list_empty(&wc->flush_list))
+ wake_up_process(wc->flush_thread);
+ bio_list_add(&wc->flush_list, bio);
+}
+
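+/*
+ * Map a bio: flushes and discards are handled inline in pmem mode and
+ * offloaded to the flush thread in SSD mode; reads are served from the
+ * cache when a matching entry exists; writes allocate entries from the
+ * freelist and either copy the data to pmem or remap the bio to the SSD.
+ */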
+static int writecache_map(struct dm_target *ti, struct bio *bio)
+{
+ struct wc_entry *e;
+ struct dm_writecache *wc = ti->private;
+
+ bio->bi_private = NULL;
+
+ wc_lock(wc);
+
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
+ if (writecache_has_error(wc))
+ goto unlock_error;
+ if (WC_MODE_PMEM(wc)) {
+ writecache_flush(wc);
+ if (writecache_has_error(wc))
+ goto unlock_error;
+ goto unlock_submit;
+ } else {
+ writecache_offload_bio(wc, bio);
+ goto unlock_return;
+ }
+ }
+
+ bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
+
+ if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
+ (wc->block_size / 512 - 1)) != 0)) {
+ DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
+ (unsigned long long)bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size, wc->block_size);
+ goto unlock_error;
+ }
+
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+ if (writecache_has_error(wc))
+ goto unlock_error;
+ if (WC_MODE_PMEM(wc)) {
+ writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
+ goto unlock_remap_origin;
+ } else {
+ writecache_offload_bio(wc, bio);
+ goto unlock_return;
+ }
+ }
+
+ if (bio_data_dir(bio) == READ) {
+read_next_block:
+ e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
+ if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
+ if (WC_MODE_PMEM(wc)) {
+ bio_copy_block(wc, bio, memory_data(wc, e));
+ if (bio->bi_iter.bi_size)
+ goto read_next_block;
+ goto unlock_submit;
+ } else {
+ dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+ bio_set_dev(bio, wc->ssd_dev->bdev);
+ bio->bi_iter.bi_sector = cache_sector(wc, e);
+ if (!writecache_entry_is_committed(wc, e))
+ writecache_wait_for_ios(wc, WRITE);
+ goto unlock_remap;
+ }
+ } else {
+ if (e) {
+ sector_t next_boundary =
+ read_original_sector(wc, e) - bio->bi_iter.bi_sector;
+ if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
+ dm_accept_partial_bio(bio, next_boundary);
+ }
+ }
+ goto unlock_remap_origin;
+ }
+ } else {
+ do {
+ if (writecache_has_error(wc))
+ goto unlock_error;
+ e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
+ if (e) {
+ if (!writecache_entry_is_committed(wc, e))
+ goto bio_copy;
+ if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
+ wc->overwrote_committed = true;
+ goto bio_copy;
+ }
+ }
+ e = writecache_pop_from_freelist(wc);
+ if (unlikely(!e)) {
+ writecache_wait_on_freelist(wc);
+ continue;
+ }
+ write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
+ writecache_insert_entry(wc, e);
+ wc->uncommitted_blocks++;
+bio_copy:
+ if (WC_MODE_PMEM(wc)) {
+ bio_copy_block(wc, bio, memory_data(wc, e));
+ } else {
+ dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+ bio_set_dev(bio, wc->ssd_dev->bdev);
+ bio->bi_iter.bi_sector = cache_sector(wc, e);
+ if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
+ wc->uncommitted_blocks = 0;
+ queue_work(wc->writeback_wq, &wc->flush_work);
+ } else {
+ writecache_schedule_autocommit(wc);
+ }
+ goto unlock_remap;
+ }
+ } while (bio->bi_iter.bi_size);
+
+ if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks))
+ writecache_flush(wc);
+ else
+ writecache_schedule_autocommit(wc);
+ goto unlock_submit;
+ }
+
+unlock_remap_origin:
+ bio_set_dev(bio, wc->dev->bdev);
+ wc_unlock(wc);
+ return DM_MAPIO_REMAPPED;
+
+unlock_remap:
+ /* make sure that writecache_end_io decrements bio_in_progress: */
+ bio->bi_private = (void *)1;
+ atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
+ wc_unlock(wc);
+ return DM_MAPIO_REMAPPED;
+
+unlock_submit:
+ wc_unlock(wc);
+ bio_endio(bio);
+ return DM_MAPIO_SUBMITTED;
+
+unlock_return:
+ wc_unlock(wc);
+ return DM_MAPIO_SUBMITTED;
+
+unlock_error:
+ wc_unlock(wc);
+ bio_io_error(bio);
+ return DM_MAPIO_SUBMITTED;
+}
+
+static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
+{
+ struct dm_writecache *wc = ti->private;
+
+ if (bio->bi_private != NULL) {
+ int dir = bio_data_dir(bio);
+ if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
+ if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
+ wake_up(&wc->bio_in_progress_wait[dir]);
+ }
+ return 0;
+}
+
+static int writecache_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct dm_writecache *wc = ti->private;
+
+ return fn(ti, wc->dev, 0, ti->len, data);
+}
+
+static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct dm_writecache *wc = ti->private;
+
+ if (limits->logical_block_size < wc->block_size)
+ limits->logical_block_size = wc->block_size;
+
+ if (limits->physical_block_size < wc->block_size)
+ limits->physical_block_size = wc->block_size;
+
+ if (limits->io_min < wc->block_size)
+ limits->io_min = wc->block_size;
+}
+
+static void writecache_writeback_endio(struct bio *bio)
+{
+ struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
+ struct dm_writecache *wc = wb->wc;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
+ if (unlikely(list_empty(&wc->endio_list)))
+ wake_up_process(wc->endio_thread);
+ list_add_tail(&wb->endio_entry, &wc->endio_list);
+ raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
+}
+
+static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
+{
+ struct copy_struct *c = ptr;
+ struct dm_writecache *wc = c->wc;
+
+ c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
+
+ raw_spin_lock_irq(&wc->endio_list_lock);
+ if (unlikely(list_empty(&wc->endio_list)))
+ wake_up_process(wc->endio_thread);
+ list_add_tail(&c->endio_entry, &wc->endio_list);
+ raw_spin_unlock_irq(&wc->endio_list_lock);
+}
+
+static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
+{
+ unsigned i;
+ struct writeback_struct *wb;
+ struct wc_entry *e;
+ unsigned long n_walked = 0;
+
+ do {
+ wb = list_entry(list->next, struct writeback_struct, endio_entry);
+ list_del(&wb->endio_entry);
+
+ if (unlikely(wb->bio.bi_status != BLK_STS_OK))
+ writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
+ "write error %d", wb->bio.bi_status);
+ i = 0;
+ do {
+ e = wb->wc_list[i];
+ BUG_ON(!e->write_in_progress);
+ e->write_in_progress = false;
+ INIT_LIST_HEAD(&e->lru);
+ if (!writecache_has_error(wc))
+ writecache_free_entry(wc, e);
+ BUG_ON(!wc->writeback_size);
+ wc->writeback_size--;
+ n_walked++;
+ if (unlikely(n_walked >= ENDIO_LATENCY)) {
+ writecache_commit_flushed(wc);
+ wc_unlock(wc);
+ wc_lock(wc);
+ n_walked = 0;
+ }
+ } while (++i < wb->wc_list_n);
+
+ if (wb->wc_list != wb->wc_list_inline)
+ kfree(wb->wc_list);
+ bio_put(&wb->bio);
+ } while (!list_empty(list));
+}
+
+static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
+{
+ struct copy_struct *c;
+ struct wc_entry *e;
+
+ do {
+ c = list_entry(list->next, struct copy_struct, endio_entry);
+ list_del(&c->endio_entry);
+
+ if (unlikely(c->error))
+ writecache_error(wc, c->error, "copy error");
+
+ e = c->e;
+ do {
+ BUG_ON(!e->write_in_progress);
+ e->write_in_progress = false;
+ INIT_LIST_HEAD(&e->lru);
+ if (!writecache_has_error(wc))
+ writecache_free_entry(wc, e);
+
+ BUG_ON(!wc->writeback_size);
+ wc->writeback_size--;
+ e++;
+ } while (--c->n_entries);
+ mempool_free(c, &wc->copy_pool);
+ } while (!list_empty(list));
+}
+
+static int writecache_endio_thread(void *data)
+{
+ struct dm_writecache *wc = data;
+
+ while (1) {
+ struct list_head list;
+
+ raw_spin_lock_irq(&wc->endio_list_lock);
+ if (!list_empty(&wc->endio_list))
+ goto pop_from_list;
+ set_current_state(TASK_INTERRUPTIBLE);
+ raw_spin_unlock_irq(&wc->endio_list_lock);
+
+ if (unlikely(kthread_should_stop())) {
+ set_current_state(TASK_RUNNING);
+ break;
+ }
+
+ schedule();
+
+ continue;
+
+pop_from_list:
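+ /*
+ * Take a private copy of the list head and repoint the first and
+ * last elements at it, so the shared list can be reinitialized and
+ * the lock dropped while the entries are processed.
+ */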
+ list = wc->endio_list;
+ list.next->prev = list.prev->next = &list;
+ INIT_LIST_HEAD(&wc->endio_list);
+ raw_spin_unlock_irq(&wc->endio_list_lock);
+
+ if (!WC_MODE_FUA(wc))
+ writecache_disk_flush(wc, wc->dev);
+
+ wc_lock(wc);
+
+ if (WC_MODE_PMEM(wc)) {
+ __writecache_endio_pmem(wc, &list);
+ } else {
+ __writecache_endio_ssd(wc, &list);
+ writecache_wait_for_ios(wc, READ);
+ }
+
+ writecache_commit_flushed(wc);
+
+ wc_unlock(wc);
+ }
+
+ return 0;
+}
+
+static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
+{
+ struct dm_writecache *wc = wb->wc;
+ unsigned block_size = wc->block_size;
+ void *address = memory_data(wc, e);
+
+ persistent_memory_flush_cache(address, block_size);
+ return bio_add_page(&wb->bio, persistent_memory_page(address),
+ block_size, persistent_memory_page_offset(address)) != 0;
+}
+
+struct writeback_list {
+ struct list_head list;
+ size_t size;
+};
+
+static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
+{
+ if (unlikely(wc->max_writeback_jobs)) {
+ if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
+ wc_lock(wc);
+ while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
+ writecache_wait_on_freelist(wc);
+ wc_unlock(wc);
+ }
+ }
+ cond_resched();
+}
+
+static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
+{
+ struct wc_entry *e, *f;
+ struct bio *bio;
+ struct writeback_struct *wb;
+ unsigned max_pages;
+
+ while (wbl->size) {
+ wbl->size--;
+ e = container_of(wbl->list.prev, struct wc_entry, lru);
+ list_del(&e->lru);
+
+ max_pages = e->wc_list_contiguous;
+
+ bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
+ wb = container_of(bio, struct writeback_struct, bio);
+ wb->wc = wc;
+ wb->bio.bi_end_io = writecache_writeback_endio;
+ bio_set_dev(&wb->bio, wc->dev->bdev);
+ wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
+ wb->page_offset = PAGE_SIZE;
+ if (max_pages <= WB_LIST_INLINE ||
+ unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
+ GFP_NOIO | __GFP_NORETRY |
+ __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
+ wb->wc_list = wb->wc_list_inline;
+ max_pages = WB_LIST_INLINE;
+ }
+
+ BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
+
+ wb->wc_list[0] = e;
+ wb->wc_list_n = 1;
+
+ while (wbl->size && wb->wc_list_n < max_pages) {
+ f = container_of(wbl->list.prev, struct wc_entry, lru);
+ if (read_original_sector(wc, f) !=
+ read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
+ break;
+ if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
+ break;
+ wbl->size--;
+ list_del(&f->lru);
+ wb->wc_list[wb->wc_list_n++] = f;
+ e = f;
+ }
+ bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
+ if (writecache_has_error(wc)) {
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(&wb->bio);
+ } else {
+ submit_bio(&wb->bio);
+ }
+
+ __writeback_throttle(wc, wbl);
+ }
+}
+
+static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
+{
+ struct wc_entry *e, *f;
+ struct dm_io_region from, to;
+ struct copy_struct *c;
+
+ while (wbl->size) {
+ unsigned n_sectors;
+
+ wbl->size--;
+ e = container_of(wbl->list.prev, struct wc_entry, lru);
+ list_del(&e->lru);
+
+ n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
+
+ from.bdev = wc->ssd_dev->bdev;
+ from.sector = cache_sector(wc, e);
+ from.count = n_sectors;
+ to.bdev = wc->dev->bdev;
+ to.sector = read_original_sector(wc, e);
+ to.count = n_sectors;
+
+ c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
+ c->wc = wc;
+ c->e = e;
+ c->n_entries = e->wc_list_contiguous;
+
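+ /*
+ * In SSD mode the collector only batches physically adjacent
+ * entries (g == f + 1), so the whole run is one linear copy.
+ */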
+ while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
+ wbl->size--;
+ f = container_of(wbl->list.prev, struct wc_entry, lru);
+ BUG_ON(f != e + 1);
+ list_del(&f->lru);
+ e = f;
+ }
+
+ dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
+
+ __writeback_throttle(wc, wbl);
+ }
+}
+
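+/*
+ * Background writeback: scan the LRU from the oldest entry, batch runs of
+ * entries that are contiguous on the origin device, and skip entries whose
+ * older copy for the same sector is still being written back.
+ */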
+static void writecache_writeback(struct work_struct *work)
+{
+ struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
+ struct blk_plug plug;
+ struct wc_entry *e, *f, *g;
+ struct rb_node *node, *next_node;
+ struct list_head skipped;
+ struct writeback_list wbl;
+ unsigned long n_walked;
+
+ wc_lock(wc);
+restart:
+ if (writecache_has_error(wc)) {
+ wc_unlock(wc);
+ return;
+ }
+
+ if (unlikely(wc->writeback_all)) {
+ if (writecache_wait_for_writeback(wc))
+ goto restart;
+ }
+
+ if (wc->overwrote_committed) {
+ writecache_wait_for_ios(wc, WRITE);
+ }
+
+ n_walked = 0;
+ INIT_LIST_HEAD(&skipped);
+ INIT_LIST_HEAD(&wbl.list);
+ wbl.size = 0;
+ while (!list_empty(&wc->lru) &&
+ (wc->writeback_all ||
+ wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) {
+
+ n_walked++;
+ if (unlikely(n_walked > WRITEBACK_LATENCY) &&
+ likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
+ queue_work(wc->writeback_wq, &wc->writeback_work);
+ break;
+ }
+
+ e = container_of(wc->lru.prev, struct wc_entry, lru);
+ BUG_ON(e->write_in_progress);
+ if (unlikely(!writecache_entry_is_committed(wc, e))) {
+ writecache_flush(wc);
+ }
+ node = rb_prev(&e->rb_node);
+ if (node) {
+ f = container_of(node, struct wc_entry, rb_node);
+ if (unlikely(read_original_sector(wc, f) ==
+ read_original_sector(wc, e))) {
+ BUG_ON(!f->write_in_progress);
+ list_del(&e->lru);
+ list_add(&e->lru, &skipped);
+ cond_resched();
+ continue;
+ }
+ }
+ wc->writeback_size++;
+ list_del(&e->lru);
+ list_add(&e->lru, &wbl.list);
+ wbl.size++;
+ e->write_in_progress = true;
+ e->wc_list_contiguous = 1;
+
+ f = e;
+
+ while (1) {
+ next_node = rb_next(&f->rb_node);
+ if (unlikely(!next_node))
+ break;
+ g = container_of(next_node, struct wc_entry, rb_node);
+ if (read_original_sector(wc, g) ==
+ read_original_sector(wc, f)) {
+ f = g;
+ continue;
+ }
+ if (read_original_sector(wc, g) !=
+ read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
+ break;
+ if (unlikely(g->write_in_progress))
+ break;
+ if (unlikely(!writecache_entry_is_committed(wc, g)))
+ break;
+
+ if (!WC_MODE_PMEM(wc)) {
+ if (g != f + 1)
+ break;
+ }
+
+ n_walked++;
+ //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
+ // break;
+
+ wc->writeback_size++;
+ list_del(&g->lru);
+ list_add(&g->lru, &wbl.list);
+ wbl.size++;
+ g->write_in_progress = true;
+ g->wc_list_contiguous = BIO_MAX_PAGES;
+ f = g;
+ e->wc_list_contiguous++;
+ if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES))
+ break;
+ }
+ cond_resched();
+ }
+
+ if (!list_empty(&skipped)) {
+ list_splice_tail(&skipped, &wc->lru);
+ /*
+ * If we didn't make any progress, we must wait until some
+ * writeback finishes to avoid burning CPU in a loop.
+ */
+ if (unlikely(!wbl.size))
+ writecache_wait_for_writeback(wc);
+ }
+
+ wc_unlock(wc);
+
+ blk_start_plug(&plug);
+
+ if (WC_MODE_PMEM(wc))
+ __writecache_writeback_pmem(wc, &wbl);
+ else
+ __writecache_writeback_ssd(wc, &wbl);
+
+ blk_finish_plug(&plug);
+
+ if (unlikely(wc->writeback_all)) {
+ wc_lock(wc);
+ while (writecache_wait_for_writeback(wc));
+ wc_unlock(wc);
+ }
+}
+
+static int calculate_memory_size(uint64_t device_size, unsigned block_size,
+ size_t *n_blocks_p, size_t *n_metadata_blocks_p)
+{
+ uint64_t n_blocks, offset;
+ struct wc_entry e;
+
+ n_blocks = device_size;
+ do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
+
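+ /*
+ * The first estimate ignores alignment; shrink n_blocks until the
+ * block-aligned metadata area plus the data blocks fit the device.
+ */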
+ while (1) {
+ if (!n_blocks)
+ return -ENOSPC;
+ /* Verify that the offset of entries[n_blocks] below won't overflow */
+ if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
+ sizeof(struct wc_memory_entry)))
+ return -EFBIG;
+ offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
+ offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
+ if (offset + n_blocks * block_size <= device_size)
+ break;
+ n_blocks--;
+ }
+
+ /* check if the bit field overflows */
+ e.index = n_blocks;
+ if (e.index != n_blocks)
+ return -EFBIG;
+
+ if (n_blocks_p)
+ *n_blocks_p = n_blocks;
+ if (n_metadata_blocks_p)
+ *n_metadata_blocks_p = offset >> __ffs(block_size);
+ return 0;
+}
+
+static int init_memory(struct dm_writecache *wc)
+{
+ size_t b;
+ int r;
+
+ r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
+ if (r)
+ return r;
+
+ r = writecache_alloc_entries(wc);
+ if (r)
+ return r;
+
+ for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
+ pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
+ pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
+ pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
+ pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
+ pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
+
+ for (b = 0; b < wc->n_blocks; b++)
+ write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
+
+ writecache_flush_all_metadata(wc);
+ writecache_commit_flushed(wc);
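+ /* Write the magic last so a half-initialized superblock is never valid */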
+ pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
+ writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
+ writecache_commit_flushed(wc);
+
+ return 0;
+}
+
+static void writecache_dtr(struct dm_target *ti)
+{
+ struct dm_writecache *wc = ti->private;
+
+ if (!wc)
+ return;
+
+ if (wc->endio_thread)
+ kthread_stop(wc->endio_thread);
+
+ if (wc->flush_thread)
+ kthread_stop(wc->flush_thread);
+
+ bioset_exit(&wc->bio_set);
+
+ mempool_exit(&wc->copy_pool);
+
+ if (wc->writeback_wq)
+ destroy_workqueue(wc->writeback_wq);
+
+ if (wc->dev)
+ dm_put_device(ti, wc->dev);
+
+ if (wc->ssd_dev)
+ dm_put_device(ti, wc->ssd_dev);
+
+ if (wc->entries)
+ vfree(wc->entries);
+
+ if (wc->memory_map) {
+ if (WC_MODE_PMEM(wc))
+ persistent_memory_release(wc);
+ else
+ vfree(wc->memory_map);
+ }
+
+ if (wc->dm_kcopyd)
+ dm_kcopyd_client_destroy(wc->dm_kcopyd);
+
+ if (wc->dm_io)
+ dm_io_client_destroy(wc->dm_io);
+
+ if (wc->dirty_bitmap)
+ vfree(wc->dirty_bitmap);
+
+ kfree(wc);
+}
+
+static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ struct dm_writecache *wc;
+ struct dm_arg_set as;
+ const char *string;
+ unsigned opt_params;
+ size_t offset, data_size;
+ int i, r;
+ char dummy;
+ int high_wm_percent = HIGH_WATERMARK;
+ int low_wm_percent = LOW_WATERMARK;
+ uint64_t x;
+ struct wc_memory_superblock s;
+
+ static struct dm_arg _args[] = {
+ {0, 10, "Invalid number of feature args"},
+ };
+
+ as.argc = argc;
+ as.argv = argv;
+
+ wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
+ if (!wc) {
+ ti->error = "Cannot allocate writecache structure";
+ r = -ENOMEM;
+ goto bad;
+ }
+ ti->private = wc;
+ wc->ti = ti;
+
+ mutex_init(&wc->lock);
+ writecache_poison_lists(wc);
+ init_waitqueue_head(&wc->freelist_wait);
+ timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
+
+ for (i = 0; i < 2; i++) {
+ atomic_set(&wc->bio_in_progress[i], 0);
+ init_waitqueue_head(&wc->bio_in_progress_wait[i]);
+ }
+
+ wc->dm_io = dm_io_client_create();
+ if (IS_ERR(wc->dm_io)) {
+ r = PTR_ERR(wc->dm_io);
+ ti->error = "Unable to allocate dm-io client";
+ wc->dm_io = NULL;
+ goto bad;
+ }
+
+ wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
+ if (!wc->writeback_wq) {
+ r = -ENOMEM;
+ ti->error = "Could not allocate writeback workqueue";
+ goto bad;
+ }
+ INIT_WORK(&wc->writeback_work, writecache_writeback);
+ INIT_WORK(&wc->flush_work, writecache_flush_work);
+
+ raw_spin_lock_init(&wc->endio_list_lock);
+ INIT_LIST_HEAD(&wc->endio_list);
+ wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
+ if (IS_ERR(wc->endio_thread)) {
+ r = PTR_ERR(wc->endio_thread);
+ wc->endio_thread = NULL;
+ ti->error = "Couldn't spawn endio thread";
+ goto bad;
+ }
+ wake_up_process(wc->endio_thread);
+
+ /*
+ * Parse the mode (pmem or ssd)
+ */
+ string = dm_shift_arg(&as);
+ if (!string)
+ goto bad_arguments;
+
+ if (!strcasecmp(string, "s")) {
+ wc->pmem_mode = false;
+ } else if (!strcasecmp(string, "p")) {
+#ifdef DM_WRITECACHE_HAS_PMEM
+ wc->pmem_mode = true;
+ wc->writeback_fua = true;
+#else
+ /*
+ * If the architecture doesn't support persistent memory or
+ * the kernel doesn't support any DAX drivers, this driver can
+ * only be used in SSD-only mode.
+ */
+ r = -EOPNOTSUPP;
+ ti->error = "Persistent memory or DAX not supported on this system";
+ goto bad;
+#endif
+ } else {
+ goto bad_arguments;
+ }
+
+ if (WC_MODE_PMEM(wc)) {
+ r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
+ offsetof(struct writeback_struct, bio),
+ BIOSET_NEED_BVECS);
+ if (r) {
+ ti->error = "Could not allocate bio set";
+ goto bad;
+ }
+ } else {
+ r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
+ if (r) {
+ ti->error = "Could not allocate mempool";
+ goto bad;
+ }
+ }
+
+ /*
+ * Parse the origin data device
+ */
+ string = dm_shift_arg(&as);
+ if (!string)
+ goto bad_arguments;
+ r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
+ if (r) {
+ ti->error = "Origin data device lookup failed";
+ goto bad;
+ }
+
+ /*
+ * Parse cache data device (be it pmem or ssd)
+ */
+ string = dm_shift_arg(&as);
+ if (!string)
+ goto bad_arguments;
+
+ r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
+ if (r) {
+ ti->error = "Cache data device lookup failed";
+ goto bad;
+ }
+ wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
+
+ if (WC_MODE_PMEM(wc)) {
+ r = persistent_memory_claim(wc);
+ if (r) {
+ ti->error = "Unable to map persistent memory for cache";
+ goto bad;
+ }
+ }
+
+ /*
+ * Parse the cache block size
+ */
+ string = dm_shift_arg(&as);
+ if (!string)
+ goto bad_arguments;
+ if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
+ wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
+ (wc->block_size & (wc->block_size - 1))) {
+ r = -EINVAL;
+ ti->error = "Invalid block size";
+ goto bad;
+ }
+ wc->block_size_bits = __ffs(wc->block_size);
+
+ wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
+ wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
+ wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
+
+ /*
+ * Parse optional arguments
+ */
+ r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+ if (r)
+ goto bad;
+
+ while (opt_params) {
+ string = dm_shift_arg(&as), opt_params--;
+ if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
+ string = dm_shift_arg(&as), opt_params--;
+ if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
+ goto invalid_optional;
+ if (high_wm_percent < 0 || high_wm_percent > 100)
+ goto invalid_optional;
+ wc->high_wm_percent_set = true;
+ } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
+ string = dm_shift_arg(&as), opt_params--;
+ if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
+ goto invalid_optional;
+ if (low_wm_percent < 0 || low_wm_percent > 100)
+ goto invalid_optional;
+ wc->low_wm_percent_set = true;
+ } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
+ string = dm_shift_arg(&as), opt_params--;
+ if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
+ goto invalid_optional;
+ wc->max_writeback_jobs_set = true;
+ } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
+ string = dm_shift_arg(&as), opt_params--;
+ if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
+ goto invalid_optional;
+ wc->autocommit_blocks_set = true;
+ } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
+ unsigned autocommit_msecs;
+ string = dm_shift_arg(&as), opt_params--;
+ if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
+ goto invalid_optional;
+ if (autocommit_msecs > 3600000)
+ goto invalid_optional;
+ wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
+ wc->autocommit_time_set = true;
+ } else if (!strcasecmp(string, "fua")) {
+ if (WC_MODE_PMEM(wc)) {
+ wc->writeback_fua = true;
+ wc->writeback_fua_set = true;
+ } else goto invalid_optional;
+ } else if (!strcasecmp(string, "nofua")) {
+ if (WC_MODE_PMEM(wc)) {
+ wc->writeback_fua = false;
+ wc->writeback_fua_set = true;
+ } else goto invalid_optional;
+ } else {
+invalid_optional:
+ r = -EINVAL;
+ ti->error = "Invalid optional argument";
+ goto bad;
+ }
+ }
+
+ if (high_wm_percent < low_wm_percent) {
+ r = -EINVAL;
+ ti->error = "High watermark must be greater than or equal to low watermark";
+ goto bad;
+ }
+
+ if (!WC_MODE_PMEM(wc)) {
+ struct dm_io_region region;
+ struct dm_io_request req;
+ size_t n_blocks, n_metadata_blocks;
+ uint64_t n_bitmap_bits;
+
+ bio_list_init(&wc->flush_list);
+ wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
+ if (IS_ERR(wc->flush_thread)) {
+ r = PTR_ERR(wc->flush_thread);
+ wc->flush_thread = NULL;
+ ti->error = "Couldn't spawn endio thread";
+ goto bad;
+ }
+ wake_up_process(wc->flush_thread);
+
+ r = calculate_memory_size(wc->memory_map_size, wc->block_size,
+ &n_blocks, &n_metadata_blocks);
+ if (r) {
+ ti->error = "Invalid device size";
+ goto bad;
+ }
+
+ n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
+ BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
+ /* this is a limitation of the test_bit functions */
+ if (n_bitmap_bits > 1U << 31) {
+ r = -EFBIG;
+ ti->error = "Invalid device size";
+ goto bad;
+ }
+
+ wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
+ if (!wc->memory_map) {
+ r = -ENOMEM;
+ ti->error = "Unable to allocate memory for metadata";
+ goto bad;
+ }
+
+ wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
+ if (IS_ERR(wc->dm_kcopyd)) {
+ r = PTR_ERR(wc->dm_kcopyd);
+ ti->error = "Unable to allocate dm-kcopyd client";
+ wc->dm_kcopyd = NULL;
+ goto bad;
+ }
+
+ wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
+ wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
+ BITS_PER_LONG * sizeof(unsigned long);
+ wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
+ if (!wc->dirty_bitmap) {
+ r = -ENOMEM;
+ ti->error = "Unable to allocate dirty bitmap";
+ goto bad;
+ }
+
+ region.bdev = wc->ssd_dev->bdev;
+ region.sector = 0;
+ region.count = wc->metadata_sectors;
+ req.bi_op = REQ_OP_READ;
+ req.bi_op_flags = REQ_SYNC;
+ req.mem.type = DM_IO_VMA;
+ req.mem.ptr.vma = (char *)wc->memory_map;
+ req.client = wc->dm_io;
+ req.notify.fn = NULL;
+
+ r = dm_io(&req, 1, &region, NULL);
+ if (r) {
+ ti->error = "Unable to read metadata";
+ goto bad;
+ }
+ }
+
+ r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
+ if (r) {
+ ti->error = "Hardware memory error when reading superblock";
+ goto bad;
+ }
+ if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
+ r = init_memory(wc);
+ if (r) {
+ ti->error = "Unable to initialize device";
+ goto bad;
+ }
+ r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
+ if (r) {
+ ti->error = "Hardware memory error when reading superblock";
+ goto bad;
+ }
+ }
+
+ if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
+ ti->error = "Invalid magic in the superblock";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
+ ti->error = "Invalid version in the superblock";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (le32_to_cpu(s.block_size) != wc->block_size) {
+ ti->error = "Block size does not match superblock";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ wc->n_blocks = le64_to_cpu(s.n_blocks);
+
+ offset = wc->n_blocks * sizeof(struct wc_memory_entry);
+ if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
+overflow:
+ ti->error = "Overflow in size calculation";
+ r = -EINVAL;
+ goto bad;
+ }
+ offset += sizeof(struct wc_memory_superblock);
+ if (offset < sizeof(struct wc_memory_superblock))
+ goto overflow;
+ offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
+ data_size = wc->n_blocks * (size_t)wc->block_size;
+ if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
+ (offset + data_size < offset))
+ goto overflow;
+ if (offset + data_size > wc->memory_map_size) {
+ ti->error = "Memory area is too small";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ wc->metadata_sectors = offset >> SECTOR_SHIFT;
+ wc->block_start = (char *)sb(wc) + offset;
+
+ x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
+ x += 50;
+ do_div(x, 100);
+ wc->freelist_high_watermark = x;
+ x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
+ x += 50;
+ do_div(x, 100);
+ wc->freelist_low_watermark = x;
+
+ r = writecache_alloc_entries(wc);
+ if (r) {
+ ti->error = "Cannot allocate memory";
+ goto bad;
+ }
+
+ ti->num_flush_bios = 1;
+ ti->flush_supported = true;
+ ti->num_discard_bios = 1;
+
+ if (WC_MODE_PMEM(wc))
+ persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
+
+ return 0;
+
+bad_arguments:
+ r = -EINVAL;
+ ti->error = "Bad arguments";
+bad:
+ writecache_dtr(ti);
+ return r;
+}
+
+static void writecache_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ struct dm_writecache *wc = ti->private;
+ unsigned extra_args;
+ unsigned sz = 0;
+ uint64_t x;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
+ (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
+ (unsigned long long)wc->writeback_size);
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
+ wc->dev->name, wc->ssd_dev->name, wc->block_size);
+ extra_args = 0;
+ if (wc->high_wm_percent_set)
+ extra_args += 2;
+ if (wc->low_wm_percent_set)
+ extra_args += 2;
+ if (wc->max_writeback_jobs_set)
+ extra_args += 2;
+ if (wc->autocommit_blocks_set)
+ extra_args += 2;
+ if (wc->autocommit_time_set)
+ extra_args += 2;
+ if (wc->writeback_fua_set)
+ extra_args++;
+
+ DMEMIT("%u", extra_args);
+ if (wc->high_wm_percent_set) {
+ x = (uint64_t)wc->freelist_high_watermark * 100;
+ x += wc->n_blocks / 2;
+ do_div(x, (size_t)wc->n_blocks);
+ DMEMIT(" high_watermark %u", 100 - (unsigned)x);
+ }
+ if (wc->low_wm_percent_set) {
+ x = (uint64_t)wc->freelist_low_watermark * 100;
+ x += wc->n_blocks / 2;
+ do_div(x, (size_t)wc->n_blocks);
+ DMEMIT(" low_watermark %u", 100 - (unsigned)x);
+ }
+ if (wc->max_writeback_jobs_set)
+ DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
+ if (wc->autocommit_blocks_set)
+ DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
+ if (wc->autocommit_time_set)
+ DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
+ if (wc->writeback_fua_set)
+ DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
+ break;
+ }
+}
+
+static struct target_type writecache_target = {
+ .name = "writecache",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = writecache_ctr,
+ .dtr = writecache_dtr,
+ .status = writecache_status,
+ .postsuspend = writecache_suspend,
+ .resume = writecache_resume,
+ .message = writecache_message,
+ .map = writecache_map,
+ .end_io = writecache_end_io,
+ .iterate_devices = writecache_iterate_devices,
+ .io_hints = writecache_io_hints,
+};
+
+static int __init dm_writecache_init(void)
+{
+ int r;
+
+ r = dm_register_target(&writecache_target);
+ if (r < 0) {
+ DMERR("register failed %d", r);
+ return r;
+ }
+
+ return 0;
+}
+
+static void __exit dm_writecache_exit(void)
+{
+ dm_unregister_target(&writecache_target);
+}
+
+module_init(dm_writecache_init);
+module_exit(dm_writecache_exit);
+
+MODULE_DESCRIPTION(DM_NAME " writecache target");
+MODULE_AUTHOR("Mikulas Patocka <[email protected]>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index e73b0776683c..a44183ff4be0 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -52,12 +52,12 @@ struct dmz_target {
struct dmz_reclaim *reclaim;
/* For chunk work */
- struct mutex chunk_lock;
struct radix_tree_root chunk_rxtree;
struct workqueue_struct *chunk_wq;
+ struct mutex chunk_lock;
/* For cloned BIOs to zones */
- struct bio_set *bio_set;
+ struct bio_set bio_set;
/* For flush */
spinlock_t flush_lock;
@@ -121,7 +121,7 @@ static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone,
}
/* Partial BIO: we need to clone the BIO */
- clone = bio_clone_fast(bio, GFP_NOIO, dmz->bio_set);
+ clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set);
if (!clone)
return -ENOMEM;
@@ -779,16 +779,15 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift;
/* Zone BIO */
- dmz->bio_set = bioset_create(DMZ_MIN_BIOS, 0, 0);
- if (!dmz->bio_set) {
+ ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
+ if (ret) {
ti->error = "Create BIO set failed";
- ret = -ENOMEM;
goto err_meta;
}
/* Chunk BIO work */
mutex_init(&dmz->chunk_lock);
- INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_KERNEL);
+ INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND,
0, dev->name);
if (!dmz->chunk_wq) {
@@ -828,7 +827,7 @@ err_cwq:
destroy_workqueue(dmz->chunk_wq);
err_bio:
mutex_destroy(&dmz->chunk_lock);
- bioset_free(dmz->bio_set);
+ bioset_exit(&dmz->bio_set);
err_meta:
dmz_dtr_metadata(dmz->metadata);
err_dev:
@@ -858,7 +857,7 @@ static void dmz_dtr(struct dm_target *ti)
dmz_dtr_metadata(dmz->metadata);
- bioset_free(dmz->bio_set);
+ bioset_exit(&dmz->bio_set);
dmz_put_zoned_device(ti);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4ea404dbcf0b..b0dd7027848b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -148,8 +148,8 @@ static int dm_numa_node = DM_NUMA_NODE;
* For mempools pre-allocation at the table loading time.
*/
struct dm_md_mempools {
- struct bio_set *bs;
- struct bio_set *io_bs;
+ struct bio_set bs;
+ struct bio_set io_bs;
};
struct table_device {
@@ -537,7 +537,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
struct dm_target_io *tio;
struct bio *clone;
- clone = bio_alloc_bioset(GFP_NOIO, 0, md->io_bs);
+ clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
if (!clone)
return NULL;
@@ -572,7 +572,7 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t
/* the dm_target_io embedded in ci->io is available */
tio = &ci->io->tio;
} else {
- struct bio *clone = bio_alloc_bioset(gfp_mask, 0, ci->io->md->bs);
+ struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
if (!clone)
return NULL;
@@ -1020,7 +1020,8 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
- sector_t sector, int *srcu_idx)
+ sector_t sector, int *srcu_idx)
+ __acquires(md->io_barrier)
{
struct dm_table *map;
struct dm_target *ti;
@@ -1037,7 +1038,7 @@ static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
}
static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
- long nr_pages, void **kaddr, pfn_t *pfn)
+ long nr_pages, void **kaddr, pfn_t *pfn)
{
struct mapped_device *md = dax_get_private(dax_dev);
sector_t sector = pgoff * PAGE_SECTORS;
@@ -1055,8 +1056,7 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
if (len < 1)
goto out;
nr_pages = min(len, nr_pages);
- if (ti->type->direct_access)
- ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
+ ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
out:
dm_put_live_table(md, srcu_idx);
@@ -1065,7 +1065,7 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
}
static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
- void *addr, size_t bytes, struct iov_iter *i)
+ void *addr, size_t bytes, struct iov_iter *i)
{
struct mapped_device *md = dax_get_private(dax_dev);
sector_t sector = pgoff * PAGE_SECTORS;
@@ -1088,6 +1088,30 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
return ret;
}
+static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i)
+{
+ struct mapped_device *md = dax_get_private(dax_dev);
+ sector_t sector = pgoff * PAGE_SECTORS;
+ struct dm_target *ti;
+ long ret = 0;
+ int srcu_idx;
+
+ ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+
+ if (!ti)
+ goto out;
+ if (!ti->type->dax_copy_to_iter) {
+ ret = copy_to_iter(addr, bytes, i);
+ goto out;
+ }
+ ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
+ out:
+ dm_put_live_table(md, srcu_idx);
+
+ return ret;
+}
+
/*
* A target may call dm_accept_partial_bio only from the map routine. It is
* allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
@@ -1581,10 +1605,9 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
* the usage of io->orig_bio in dm_remap_zone_report()
* won't be affected by this reassignment.
*/
- struct bio *b = bio_clone_bioset(bio, GFP_NOIO,
- md->queue->bio_split);
+ struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
+ GFP_NOIO, &md->queue->bio_split);
ci.io->orig_bio = b;
- bio_advance(bio, (bio_sectors(bio) - ci.sector_count) << 9);
bio_chain(b, bio);
ret = generic_make_request(bio);
break;
@@ -1784,10 +1807,8 @@ static void cleanup_mapped_device(struct mapped_device *md)
destroy_workqueue(md->wq);
if (md->kworker_task)
kthread_stop(md->kworker_task);
- if (md->bs)
- bioset_free(md->bs);
- if (md->io_bs)
- bioset_free(md->io_bs);
+ bioset_exit(&md->bs);
+ bioset_exit(&md->io_bs);
if (md->dax_dev) {
kill_dax(md->dax_dev);
@@ -1954,9 +1975,10 @@ static void free_dev(struct mapped_device *md)
kvfree(md);
}
-static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
+static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
struct dm_md_mempools *p = dm_table_get_md_mempools(t);
+ int ret = 0;
if (dm_table_bio_based(t)) {
/*
@@ -1964,16 +1986,10 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
* If so, reload bioset because front_pad may have changed
* because a different table was loaded.
*/
- if (md->bs) {
- bioset_free(md->bs);
- md->bs = NULL;
- }
- if (md->io_bs) {
- bioset_free(md->io_bs);
- md->io_bs = NULL;
- }
+ bioset_exit(&md->bs);
+ bioset_exit(&md->io_bs);
- } else if (md->bs) {
+ } else if (bioset_initialized(&md->bs)) {
/*
* There's no need to reload with request-based dm
* because the size of front_pad doesn't change.
@@ -1985,15 +2001,20 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
goto out;
}
- BUG_ON(!p || md->bs || md->io_bs);
+ BUG_ON(!p ||
+ bioset_initialized(&md->bs) ||
+ bioset_initialized(&md->io_bs));
- md->bs = p->bs;
- p->bs = NULL;
- md->io_bs = p->io_bs;
- p->io_bs = NULL;
+ ret = bioset_init_from_src(&md->bs, &p->bs);
+ if (ret)
+ goto out;
+ ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
+ if (ret)
+ bioset_exit(&md->bs);
out:
/* mempool bind completed, no longer need any mempools in the table */
dm_table_free_md_mempools(t);
+ return ret;
}
/*
@@ -2038,6 +2059,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
struct request_queue *q = md->queue;
bool request_based = dm_table_request_based(t);
sector_t size;
+ int ret;
lockdep_assert_held(&md->suspend_lock);
@@ -2073,7 +2095,11 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
md->immutable_target = dm_table_get_immutable_target(t);
}
- __bind_mempools(md, t);
+ ret = __bind_mempools(md, t);
+ if (ret) {
+ old_map = ERR_PTR(ret);
+ goto out;
+ }
old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
rcu_assign_pointer(md->map, (void *)t);
@@ -2083,6 +2109,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
if (old_map)
dm_sync_table(md);
+out:
return old_map;
}
@@ -2904,6 +2931,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
unsigned int pool_size = 0;
unsigned int front_pad, io_front_pad;
+ int ret;
if (!pools)
return NULL;
@@ -2915,10 +2943,10 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
- pools->io_bs = bioset_create(pool_size, io_front_pad, 0);
- if (!pools->io_bs)
+ ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
+ if (ret)
goto out;
- if (integrity && bioset_integrity_create(pools->io_bs, pool_size))
+ if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
goto out;
break;
case DM_TYPE_REQUEST_BASED:
@@ -2931,11 +2959,11 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
BUG();
}
- pools->bs = bioset_create(pool_size, front_pad, 0);
- if (!pools->bs)
+ ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
+ if (ret)
goto out;
- if (integrity && bioset_integrity_create(pools->bs, pool_size))
+ if (integrity && bioset_integrity_create(&pools->bs, pool_size))
goto out;
return pools;
@@ -2951,10 +2979,8 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
if (!pools)
return;
- if (pools->bs)
- bioset_free(pools->bs);
- if (pools->io_bs)
- bioset_free(pools->io_bs);
+ bioset_exit(&pools->bs);
+ bioset_exit(&pools->io_bs);
kfree(pools);
}
@@ -3133,6 +3159,7 @@ static const struct block_device_operations dm_blk_dops = {
static const struct dax_operations dm_dax_ops = {
.direct_access = dm_dax_direct_access,
.copy_from_iter = dm_dax_copy_from_iter,
+ .copy_to_iter = dm_dax_copy_to_iter,
};
/*
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 239c7bb3929b..f983c3fdf204 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -789,8 +789,8 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
offset = slot_number * num_pages;
- store->filemap = kmalloc(sizeof(struct page *)
- * num_pages, GFP_KERNEL);
+ store->filemap = kmalloc_array(num_pages, sizeof(struct page *),
+ GFP_KERNEL);
if (!store->filemap)
return -ENOMEM;
@@ -2117,7 +2117,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
- new_bp = kzalloc(pages * sizeof(*new_bp), GFP_KERNEL);
+ new_bp = kcalloc(pages, sizeof(*new_bp), GFP_KERNEL);
ret = -ENOMEM;
if (!new_bp) {
bitmap_file_unmap(&store);
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 79bfbc840385..021cbf9ef1bf 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1380,9 +1380,9 @@ static int lock_all_bitmaps(struct mddev *mddev)
char str[64];
struct md_cluster_info *cinfo = mddev->cluster_info;
- cinfo->other_bitmap_lockres = kzalloc((mddev->bitmap_info.nodes - 1) *
- sizeof(struct dlm_lock_resource *),
- GFP_KERNEL);
+ cinfo->other_bitmap_lockres =
+ kcalloc(mddev->bitmap_info.nodes - 1,
+ sizeof(struct dlm_lock_resource *), GFP_KERNEL);
if (!cinfo->other_bitmap_lockres) {
pr_err("md: can't alloc mem for other bitmap locks\n");
return 0;
diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
index 38264b38420f..c2fdf899de14 100644
--- a/drivers/md/md-faulty.c
+++ b/drivers/md/md-faulty.c
@@ -214,7 +214,7 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
}
}
if (failit) {
- struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ struct bio *b = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
bio_set_dev(b, conf->rdev->bdev);
b->bi_private = bio;
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 4964323d936b..d45c697c0ebe 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -269,7 +269,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
if (unlikely(bio_end_sector(bio) > end_sector)) {
/* This bio crosses a device boundary, so we have to split it */
struct bio *split = bio_split(bio, end_sector - bio_sector,
- GFP_NOIO, mddev->bio_set);
+ GFP_NOIO, &mddev->bio_set);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 0a7e99d62c69..881487de1e25 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -80,7 +80,7 @@ static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status)
bio->bi_status = status;
bio_endio(bio);
- mempool_free(mp_bh, conf->pool);
+ mempool_free(mp_bh, &conf->pool);
}
static void multipath_end_request(struct bio *bio)
@@ -117,7 +117,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
return true;
}
- mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
+ mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
mp_bh->master_bio = bio;
mp_bh->mddev = mddev;
@@ -125,7 +125,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
mp_bh->path = multipath_map(conf);
if (mp_bh->path < 0) {
bio_io_error(bio);
- mempool_free(mp_bh, conf->pool);
+ mempool_free(mp_bh, &conf->pool);
return true;
}
multipath = conf->multipaths + mp_bh->path;
@@ -378,6 +378,7 @@ static int multipath_run (struct mddev *mddev)
struct multipath_info *disk;
struct md_rdev *rdev;
int working_disks;
+ int ret;
if (md_check_no_bitmap(mddev))
return -EINVAL;
@@ -398,7 +399,8 @@ static int multipath_run (struct mddev *mddev)
if (!conf)
goto out;
- conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
+ conf->multipaths = kcalloc(mddev->raid_disks,
+ sizeof(struct multipath_info),
GFP_KERNEL);
if (!conf->multipaths)
goto out_free_conf;
@@ -431,9 +433,9 @@ static int multipath_run (struct mddev *mddev)
}
mddev->degraded = conf->raid_disks - working_disks;
- conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
- sizeof(struct multipath_bh));
- if (conf->pool == NULL)
+ ret = mempool_init_kmalloc_pool(&conf->pool, NR_RESERVED_BUFS,
+ sizeof(struct multipath_bh));
+ if (ret)
goto out_free_conf;
mddev->thread = md_register_thread(multipathd, mddev,
@@ -455,7 +457,7 @@ static int multipath_run (struct mddev *mddev)
return 0;
out_free_conf:
- mempool_destroy(conf->pool);
+ mempool_exit(&conf->pool);
kfree(conf->multipaths);
kfree(conf);
mddev->private = NULL;
@@ -467,7 +469,7 @@ static void multipath_free(struct mddev *mddev, void *priv)
{
struct mpconf *conf = priv;
- mempool_destroy(conf->pool);
+ mempool_exit(&conf->pool);
kfree(conf->multipaths);
kfree(conf);
}
diff --git a/drivers/md/md-multipath.h b/drivers/md/md-multipath.h
index 0adb941f485a..b3099e5fc4d7 100644
--- a/drivers/md/md-multipath.h
+++ b/drivers/md/md-multipath.h
@@ -13,7 +13,7 @@ struct mpconf {
spinlock_t device_lock;
struct list_head retry_list;
- mempool_t *pool;
+ mempool_t pool;
};
/*
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c208c01f63a5..29b0cd9ec951 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -84,6 +84,8 @@ static void autostart_arrays(int part);
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);
+static struct kobj_type md_ktype;
+
struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
struct module *md_cluster_mod;
@@ -130,6 +132,24 @@ static inline int speed_max(struct mddev *mddev)
mddev->sync_speed_max : sysctl_speed_limit_max;
}
+static void * flush_info_alloc(gfp_t gfp_flags, void *data)
+{
+ return kzalloc(sizeof(struct flush_info), gfp_flags);
+}
+static void flush_info_free(void *flush_info, void *data)
+{
+ kfree(flush_info);
+}
+
+static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
+{
+ return kzalloc(sizeof(struct flush_bio), gfp_flags);
+}
+static void flush_bio_free(void *flush_bio, void *data)
+{
+ kfree(flush_bio);
+}
+
static struct ctl_table_header *raid_table_header;
static struct ctl_table raid_table[] = {
@@ -193,10 +213,10 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
{
struct bio *b;
- if (!mddev || !mddev->bio_set)
+ if (!mddev || !bioset_initialized(&mddev->bio_set))
return bio_alloc(gfp_mask, nr_iovecs);
- b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
+ b = bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
if (!b)
return NULL;
return b;
@@ -205,10 +225,10 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev);
static struct bio *md_bio_alloc_sync(struct mddev *mddev)
{
- if (!mddev || !mddev->sync_set)
+ if (!mddev || !bioset_initialized(&mddev->sync_set))
return bio_alloc(GFP_NOIO, 1);
- return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set);
+ return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
}
/*
@@ -412,30 +432,53 @@ static int md_congested(void *data, int bits)
/*
* Generic flush handling for md
*/
+static void submit_flushes(struct work_struct *ws)
+{
+ struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
+ struct mddev *mddev = fi->mddev;
+ struct bio *bio = fi->bio;
+
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ md_handle_request(mddev, bio);
-static void md_end_flush(struct bio *bio)
+ mempool_free(fi, mddev->flush_pool);
+}
+
+static void md_end_flush(struct bio *fbio)
{
- struct md_rdev *rdev = bio->bi_private;
- struct mddev *mddev = rdev->mddev;
+ struct flush_bio *fb = fbio->bi_private;
+ struct md_rdev *rdev = fb->rdev;
+ struct flush_info *fi = fb->fi;
+ struct bio *bio = fi->bio;
+ struct mddev *mddev = fi->mddev;
rdev_dec_pending(rdev, mddev);
- if (atomic_dec_and_test(&mddev->flush_pending)) {
- /* The pre-request flush has finished */
- queue_work(md_wq, &mddev->flush_work);
+ if (atomic_dec_and_test(&fi->flush_pending)) {
+ if (bio->bi_iter.bi_size == 0)
+ /* an empty barrier - all done */
+ bio_endio(bio);
+ else {
+ INIT_WORK(&fi->flush_work, submit_flushes);
+ queue_work(md_wq, &fi->flush_work);
+ }
}
- bio_put(bio);
-}
-static void md_submit_flush_data(struct work_struct *ws);
+ mempool_free(fb, mddev->flush_bio_pool);
+ bio_put(fbio);
+}
-static void submit_flushes(struct work_struct *ws)
+void md_flush_request(struct mddev *mddev, struct bio *bio)
{
- struct mddev *mddev = container_of(ws, struct mddev, flush_work);
struct md_rdev *rdev;
+ struct flush_info *fi;
+
+ fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
+
+ fi->bio = bio;
+ fi->mddev = mddev;
+ atomic_set(&fi->flush_pending, 1);
- INIT_WORK(&mddev->flush_work, md_submit_flush_data);
- atomic_set(&mddev->flush_pending, 1);
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
@@ -445,59 +488,39 @@ static void submit_flushes(struct work_struct *ws)
* we reclaim rcu_read_lock
*/
struct bio *bi;
+ struct flush_bio *fb;
atomic_inc(&rdev->nr_pending);
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
+
+ fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
+ fb->fi = fi;
+ fb->rdev = rdev;
+
bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
- bi->bi_end_io = md_end_flush;
- bi->bi_private = rdev;
bio_set_dev(bi, rdev->bdev);
+ bi->bi_end_io = md_end_flush;
+ bi->bi_private = fb;
bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
- atomic_inc(&mddev->flush_pending);
+
+ atomic_inc(&fi->flush_pending);
submit_bio(bi);
+
rcu_read_lock();
rdev_dec_pending(rdev, mddev);
}
rcu_read_unlock();
- if (atomic_dec_and_test(&mddev->flush_pending))
- queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
- struct mddev *mddev = container_of(ws, struct mddev, flush_work);
- struct bio *bio = mddev->flush_bio;
-
- /*
- * must reset flush_bio before calling into md_handle_request to avoid a
- * deadlock, because other bios passed md_handle_request suspend check
- * could wait for this and below md_handle_request could wait for those
- * bios because of suspend check
- */
- mddev->flush_bio = NULL;
- wake_up(&mddev->sb_wait);
- if (bio->bi_iter.bi_size == 0)
- /* an empty barrier - all done */
- bio_endio(bio);
- else {
- bio->bi_opf &= ~REQ_PREFLUSH;
- md_handle_request(mddev, bio);
+ if (atomic_dec_and_test(&fi->flush_pending)) {
+ if (bio->bi_iter.bi_size == 0)
+ /* an empty barrier - all done */
+ bio_endio(bio);
+ else {
+ INIT_WORK(&fi->flush_work, submit_flushes);
+ queue_work(md_wq, &fi->flush_work);
+ }
}
}
-
-void md_flush_request(struct mddev *mddev, struct bio *bio)
-{
- spin_lock_irq(&mddev->lock);
- wait_event_lock_irq(mddev->sb_wait,
- !mddev->flush_bio,
- mddev->lock);
- mddev->flush_bio = bio;
- spin_unlock_irq(&mddev->lock);
-
- INIT_WORK(&mddev->flush_work, submit_flushes);
- queue_work(md_wq, &mddev->flush_work);
-}
EXPORT_SYMBOL(md_flush_request);
static inline struct mddev *mddev_get(struct mddev *mddev)
@@ -510,8 +533,6 @@ static void mddev_delayed_delete(struct work_struct *ws);
static void mddev_put(struct mddev *mddev)
{
- struct bio_set *bs = NULL, *sync_bs = NULL;
-
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
if (!mddev->raid_disks && list_empty(&mddev->disks) &&
@@ -519,32 +540,23 @@ static void mddev_put(struct mddev *mddev)
/* Array is not configured at all, and not held active,
* so destroy it */
list_del_init(&mddev->all_mddevs);
- bs = mddev->bio_set;
- sync_bs = mddev->sync_set;
- mddev->bio_set = NULL;
- mddev->sync_set = NULL;
- if (mddev->gendisk) {
- /* We did a probe so need to clean up. Call
- * queue_work inside the spinlock so that
- * flush_workqueue() after mddev_find will
- * succeed in waiting for the work to be done.
- */
- INIT_WORK(&mddev->del_work, mddev_delayed_delete);
- queue_work(md_misc_wq, &mddev->del_work);
- } else
- kfree(mddev);
+
+ /*
+ * Call queue_work inside the spinlock so that
+ * flush_workqueue() after mddev_find will succeed in waiting
+ * for the work to be done.
+ */
+ INIT_WORK(&mddev->del_work, mddev_delayed_delete);
+ queue_work(md_misc_wq, &mddev->del_work);
}
spin_unlock(&all_mddevs_lock);
- if (bs)
- bioset_free(bs);
- if (sync_bs)
- bioset_free(sync_bs);
}
static void md_safemode_timeout(struct timer_list *t);
void mddev_init(struct mddev *mddev)
{
+ kobject_init(&mddev->kobj, &md_ktype);
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->bitmap_info.mutex);
@@ -555,7 +567,6 @@ void mddev_init(struct mddev *mddev)
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
spin_lock_init(&mddev->lock);
- atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait);
mddev->reshape_position = MaxSector;
@@ -2123,7 +2134,7 @@ int md_integrity_register(struct mddev *mddev)
bdev_get_integrity(reference->bdev));
pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
- if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
+ if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) {
pr_err("md: failed to create integrity pool for %s\n",
mdname(mddev));
return -EINVAL;
@@ -2853,7 +2864,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
err = 0;
}
} else if (cmd_match(buf, "re-add")) {
- if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
+ if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
+ rdev->saved_raid_disk >= 0) {
/* clear_bit is performed _after_ all the devices
* have their local Faulty bit cleared. If any writes
* happen in the meantime in the local node, they
@@ -5214,6 +5226,8 @@ static void md_free(struct kobject *ko)
put_disk(mddev->gendisk);
percpu_ref_exit(&mddev->writes_pending);
+ bioset_exit(&mddev->bio_set);
+ bioset_exit(&mddev->sync_set);
kfree(mddev);
}
@@ -5347,8 +5361,7 @@ static int md_alloc(dev_t dev, char *name)
mutex_lock(&mddev->open_mutex);
add_disk(disk);
- error = kobject_init_and_add(&mddev->kobj, &md_ktype,
- &disk_to_dev(disk)->kobj, "%s", "md");
+ error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
if (error) {
/* This isn't possible, but as kobject_init_and_add is marked
* __must_check, we must do something with the result
@@ -5497,14 +5510,28 @@ int md_run(struct mddev *mddev)
sysfs_notify_dirent_safe(rdev->sysfs_state);
}
- if (mddev->bio_set == NULL) {
- mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (!mddev->bio_set)
- return -ENOMEM;
+ if (!bioset_initialized(&mddev->bio_set)) {
+ err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (err)
+ return err;
+ }
+ if (!bioset_initialized(&mddev->sync_set)) {
+ err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (err)
+ return err;
+ }
+ if (mddev->flush_pool == NULL) {
+ mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
+ flush_info_free, mddev);
+ if (!mddev->flush_pool) {
+ err = -ENOMEM;
+ goto abort;
+ }
}
- if (mddev->sync_set == NULL) {
- mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (!mddev->sync_set) {
+ if (mddev->flush_bio_pool == NULL) {
+ mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
+ flush_bio_free, mddev);
+ if (!mddev->flush_bio_pool) {
err = -ENOMEM;
goto abort;
}
@@ -5520,8 +5547,7 @@ int md_run(struct mddev *mddev)
else
pr_warn("md: personality for level %s is not loaded!\n",
mddev->clevel);
- err = -EINVAL;
- goto abort;
+ return -EINVAL;
}
spin_unlock(&pers_lock);
if (mddev->level != pers->level) {
@@ -5534,8 +5560,7 @@ int md_run(struct mddev *mddev)
pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */
module_put(pers->owner);
- err = -EINVAL;
- goto abort;
+ return -EINVAL;
}
if (pers->sync_request) {
@@ -5604,7 +5629,7 @@ int md_run(struct mddev *mddev)
mddev->private = NULL;
module_put(pers->owner);
bitmap_destroy(mddev);
- goto abort;
+ return err;
}
if (mddev->queue) {
bool nonrot = true;
@@ -5668,13 +5693,13 @@ int md_run(struct mddev *mddev)
return 0;
abort:
- if (mddev->bio_set) {
- bioset_free(mddev->bio_set);
- mddev->bio_set = NULL;
+ if (mddev->flush_bio_pool) {
+ mempool_destroy(mddev->flush_bio_pool);
+ mddev->flush_bio_pool = NULL;
}
- if (mddev->sync_set) {
- bioset_free(mddev->sync_set);
- mddev->sync_set = NULL;
+ if (mddev->flush_pool) {
+ mempool_destroy(mddev->flush_pool);
+ mddev->flush_pool = NULL;
}
return err;
@@ -5888,14 +5913,16 @@ void md_stop(struct mddev *mddev)
* This is called from dm-raid
*/
__md_stop(mddev);
- if (mddev->bio_set) {
- bioset_free(mddev->bio_set);
- mddev->bio_set = NULL;
+ if (mddev->flush_bio_pool) {
+ mempool_destroy(mddev->flush_bio_pool);
+ mddev->flush_bio_pool = NULL;
}
- if (mddev->sync_set) {
- bioset_free(mddev->sync_set);
- mddev->sync_set = NULL;
+ if (mddev->flush_pool) {
+ mempool_destroy(mddev->flush_pool);
+ mddev->flush_pool = NULL;
}
+ bioset_exit(&mddev->bio_set);
+ bioset_exit(&mddev->sync_set);
}
EXPORT_SYMBOL_GPL(md_stop);
@@ -6524,6 +6551,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
char b[BDEVNAME_SIZE];
struct md_rdev *rdev;
+ if (!mddev->pers)
+ return -ENODEV;
+
rdev = find_rdev(mddev, dev);
if (!rdev)
return -ENXIO;
@@ -8641,6 +8671,7 @@ static int remove_and_add_spares(struct mddev *mddev,
if (mddev->pers->hot_remove_disk(
mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev);
+ rdev->saved_raid_disk = rdev->raid_disk;
rdev->raid_disk = -1;
removed++;
}
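
Note: the md.c rework above drops the single per-array flush_bio (all flushes used to serialize on sb_wait) in favour of a flush_info allocated per request from flush_pool. The completion discipline is a bias-counted fan-out: flush_pending starts at 1 for the submitter, each per-rdev flush bio adds one, and whoever drops the count to zero either ends an empty barrier or queues submit_flushes() to issue the data portion; that is why md_flush_request() and md_end_flush() contain the same final branch. A reduced sketch of the counting pattern alone, with hypothetical names:

        #include <linux/atomic.h>

        struct fanout {
                atomic_t pending;
                void (*last_done)(struct fanout *f);
        };

        static void fanout_child_done(struct fanout *f)
        {
                if (atomic_dec_and_test(&f->pending))
                        f->last_done(f);        /* final reference dropped */
        }

        static void fanout_submit(struct fanout *f, int nr,
                                  void (*submit_one)(struct fanout *f, int i))
        {
                int i;

                atomic_set(&f->pending, 1);     /* submitter's bias reference */
                for (i = 0; i < nr; i++) {
                        atomic_inc(&f->pending);
                        submit_one(f, i);       /* may complete immediately */
                }
                fanout_child_done(f);           /* drop the bias reference */
        }
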
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fbc925cce810..2d148bdaba74 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -252,6 +252,19 @@ enum mddev_sb_flags {
MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
};
+#define NR_FLUSH_INFOS 8
+#define NR_FLUSH_BIOS 64
+struct flush_info {
+ struct bio *bio;
+ struct mddev *mddev;
+ struct work_struct flush_work;
+ atomic_t flush_pending;
+};
+struct flush_bio {
+ struct flush_info *fi;
+ struct md_rdev *rdev;
+};
+
struct mddev {
void *private;
struct md_personality *pers;
@@ -452,18 +465,13 @@ struct mddev {
struct attribute_group *to_remove;
- struct bio_set *bio_set;
- struct bio_set *sync_set; /* for sync operations like
+ struct bio_set bio_set;
+ struct bio_set sync_set; /* for sync operations like
* metadata and bitmap writes
*/
- /* Generic flush handling.
- * The last to finish preflush schedules a worker to submit
- * the rest of the request (without the REQ_PREFLUSH flag).
- */
- struct bio *flush_bio;
- atomic_t flush_pending;
- struct work_struct flush_work;
+ mempool_t *flush_pool;
+ mempool_t *flush_bio_pool;
struct work_struct event_work; /* used by dm to report failure event */
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info;
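
Note: with bio_set embedded in struct mddev, "does this set exist yet" can no longer be a NULL test; bioset_initialized() inspects the structure itself, which is what the md_run() hunks above rely on for their lazy, idempotent setup. A minimal sketch, assuming the same BIO_POOL_SIZE sizing:

        #include <linux/bio.h>

        static int ensure_bioset(struct bio_set *bs)
        {
                /* idempotent: skip if an earlier md_run() already set it up */
                if (bioset_initialized(bs))
                        return 0;
                return bioset_init(bs, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
        }
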
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 584c10347267..ac1cffd2a09b 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -159,12 +159,14 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
err = -ENOMEM;
- conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
- conf->nr_strip_zones, GFP_KERNEL);
+ conf->strip_zone = kcalloc(conf->nr_strip_zones,
+ sizeof(struct strip_zone),
+ GFP_KERNEL);
if (!conf->strip_zone)
goto abort;
- conf->devlist = kzalloc(sizeof(struct md_rdev*)*
- conf->nr_strip_zones*mddev->raid_disks,
+ conf->devlist = kzalloc(array3_size(sizeof(struct md_rdev *),
+ conf->nr_strip_zones,
+ mddev->raid_disks),
GFP_KERNEL);
if (!conf->devlist)
goto abort;
@@ -479,7 +481,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
if (bio_end_sector(bio) > zone->zone_end) {
struct bio *split = bio_split(bio,
zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
- mddev->bio_set);
+ &mddev->bio_set);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
@@ -582,7 +584,8 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
sector = bio_sector;
if (sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, sectors, GFP_NOIO, mddev->bio_set);
+ struct bio *split = bio_split(bio, sectors, GFP_NOIO,
+ &mddev->bio_set);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
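
Note: the kzalloc(a * b) to kcalloc()/array3_size() changes in raid0 (and below in raid1/raid5) are overflow hardening: a wrapped multiplication silently under-allocates and later writes run past the buffer. kcalloc() and kmalloc_array() check the two-factor product; array3_size() saturates a three-factor product at SIZE_MAX so the allocation fails cleanly instead. A sketch of both idioms, with hypothetical names:

        #include <linux/overflow.h>
        #include <linux/slab.h>

        /* three factors: saturates to SIZE_MAX on overflow, kzalloc() then fails */
        static void *alloc_grid(size_t rows, size_t cols, size_t elem_size)
        {
                return kzalloc(array3_size(elem_size, rows, cols), GFP_KERNEL);
        }

        /* two factors, zeroed: kcalloc() rejects an overflowing product */
        static void *alloc_row(size_t n, size_t elem_size)
        {
                return kcalloc(n, elem_size, GFP_KERNEL);
        }
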
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e9e3308cb0a7..8e05c1092aef 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -126,8 +126,8 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
if (!r1_bio)
return NULL;
- rps = kmalloc(sizeof(struct resync_pages) * pi->raid_disks,
- gfp_flags);
+ rps = kmalloc_array(pi->raid_disks, sizeof(struct resync_pages),
+ gfp_flags);
if (!rps)
goto out_free_r1bio;
@@ -221,7 +221,7 @@ static void free_r1bio(struct r1bio *r1_bio)
struct r1conf *conf = r1_bio->mddev->private;
put_all_bios(conf, r1_bio);
- mempool_free(r1_bio, conf->r1bio_pool);
+ mempool_free(r1_bio, &conf->r1bio_pool);
}
static void put_buf(struct r1bio *r1_bio)
@@ -236,7 +236,7 @@ static void put_buf(struct r1bio *r1_bio)
rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
}
- mempool_free(r1_bio, conf->r1buf_pool);
+ mempool_free(r1_bio, &conf->r1buf_pool);
lower_barrier(conf, sect);
}
@@ -1178,7 +1178,7 @@ alloc_r1bio(struct mddev *mddev, struct bio *bio)
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
- r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+ r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO);
/* Ensure no bio records IO_BLOCKED */
memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0]));
init_r1bio(r1_bio, mddev, bio);
@@ -1268,7 +1268,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
- gfp, conf->bio_split);
+ gfp, &conf->bio_split);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
@@ -1278,7 +1278,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
r1_bio->read_disk = rdisk;
- read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
+ read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
r1_bio->bios[rdisk] = read_bio;
@@ -1439,7 +1439,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
- GFP_NOIO, conf->bio_split);
+ GFP_NOIO, &conf->bio_split);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
@@ -1479,9 +1479,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (r1_bio->behind_master_bio)
mbio = bio_clone_fast(r1_bio->behind_master_bio,
- GFP_NOIO, mddev->bio_set);
+ GFP_NOIO, &mddev->bio_set);
else
- mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
if (r1_bio->behind_master_bio) {
if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
@@ -1657,8 +1657,7 @@ static void close_sync(struct r1conf *conf)
_allow_barrier(conf, idx);
}
- mempool_destroy(conf->r1buf_pool);
- conf->r1buf_pool = NULL;
+ mempool_exit(&conf->r1buf_pool);
}
static int raid1_spare_active(struct mddev *mddev)
@@ -2348,10 +2347,10 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
wbio = bio_clone_fast(r1_bio->behind_master_bio,
GFP_NOIO,
- mddev->bio_set);
+ &mddev->bio_set);
} else {
wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
- mddev->bio_set);
+ &mddev->bio_set);
}
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
@@ -2450,7 +2449,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
struct mddev *mddev = conf->mddev;
struct bio *bio;
struct md_rdev *rdev;
- sector_t bio_sector;
clear_bit(R1BIO_ReadError, &r1_bio->state);
/* we got a read error. Maybe the drive is bad. Maybe just
@@ -2463,7 +2461,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
*/
bio = r1_bio->bios[r1_bio->read_disk];
- bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector;
bio_put(bio);
r1_bio->bios[r1_bio->read_disk] = NULL;
@@ -2474,6 +2471,8 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
fix_read_error(conf, r1_bio->read_disk,
r1_bio->sector, r1_bio->sectors);
unfreeze_array(conf);
+ } else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) {
+ md_error(mddev, rdev);
} else {
r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
}
@@ -2564,17 +2563,15 @@ static int init_resync(struct r1conf *conf)
int buffs;
buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
- BUG_ON(conf->r1buf_pool);
- conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
- conf->poolinfo);
- if (!conf->r1buf_pool)
- return -ENOMEM;
- return 0;
+ BUG_ON(mempool_initialized(&conf->r1buf_pool));
+
+ return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc,
+ r1buf_pool_free, conf->poolinfo);
}
static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
{
- struct r1bio *r1bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
+ struct r1bio *r1bio = mempool_alloc(&conf->r1buf_pool, GFP_NOIO);
struct resync_pages *rps;
struct bio *bio;
int i;
@@ -2617,7 +2614,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
int idx = sector_to_idx(sector_nr);
int page_idx = 0;
- if (!conf->r1buf_pool)
+ if (!mempool_initialized(&conf->r1buf_pool))
if (init_resync(conf))
return 0;
@@ -2939,9 +2936,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf->barrier)
goto abort;
- conf->mirrors = kzalloc(sizeof(struct raid1_info)
- * mddev->raid_disks * 2,
- GFP_KERNEL);
+ conf->mirrors = kzalloc(array3_size(sizeof(struct raid1_info),
+ mddev->raid_disks, 2),
+ GFP_KERNEL);
if (!conf->mirrors)
goto abort;
@@ -2953,14 +2950,13 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf->poolinfo)
goto abort;
conf->poolinfo->raid_disks = mddev->raid_disks * 2;
- conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
- r1bio_pool_free,
- conf->poolinfo);
- if (!conf->r1bio_pool)
+ err = mempool_init(&conf->r1bio_pool, NR_RAID1_BIOS, r1bio_pool_alloc,
+ r1bio_pool_free, conf->poolinfo);
+ if (err)
goto abort;
- conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
- if (!conf->bio_split)
+ err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
+ if (err)
goto abort;
conf->poolinfo->mddev = mddev;
@@ -3033,7 +3029,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
abort:
if (conf) {
- mempool_destroy(conf->r1bio_pool);
+ mempool_exit(&conf->r1bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
kfree(conf->poolinfo);
@@ -3041,8 +3037,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
kfree(conf->nr_waiting);
kfree(conf->nr_queued);
kfree(conf->barrier);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
+ bioset_exit(&conf->bio_split);
kfree(conf);
}
return ERR_PTR(err);
@@ -3144,7 +3139,7 @@ static void raid1_free(struct mddev *mddev, void *priv)
{
struct r1conf *conf = priv;
- mempool_destroy(conf->r1bio_pool);
+ mempool_exit(&conf->r1bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
kfree(conf->poolinfo);
@@ -3152,8 +3147,7 @@ static void raid1_free(struct mddev *mddev, void *priv)
kfree(conf->nr_waiting);
kfree(conf->nr_queued);
kfree(conf->barrier);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
+ bioset_exit(&conf->bio_split);
kfree(conf);
}
@@ -3199,13 +3193,17 @@ static int raid1_reshape(struct mddev *mddev)
* At the same time, we "pack" the devices so that all the missing
* devices have the higher raid_disk numbers.
*/
- mempool_t *newpool, *oldpool;
+ mempool_t newpool, oldpool;
struct pool_info *newpoolinfo;
struct raid1_info *newmirrors;
struct r1conf *conf = mddev->private;
int cnt, raid_disks;
unsigned long flags;
int d, d2;
+ int ret;
+
+ memset(&newpool, 0, sizeof(newpool));
+ memset(&oldpool, 0, sizeof(oldpool));
/* Cannot change chunk_size, layout, or level */
if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
@@ -3237,17 +3235,18 @@ static int raid1_reshape(struct mddev *mddev)
newpoolinfo->mddev = mddev;
newpoolinfo->raid_disks = raid_disks * 2;
- newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
- r1bio_pool_free, newpoolinfo);
- if (!newpool) {
+ ret = mempool_init(&newpool, NR_RAID1_BIOS, r1bio_pool_alloc,
+ r1bio_pool_free, newpoolinfo);
+ if (ret) {
kfree(newpoolinfo);
- return -ENOMEM;
+ return ret;
}
- newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
+ newmirrors = kzalloc(array3_size(sizeof(struct raid1_info),
+ raid_disks, 2),
GFP_KERNEL);
if (!newmirrors) {
kfree(newpoolinfo);
- mempool_destroy(newpool);
+ mempool_exit(&newpool);
return -ENOMEM;
}
@@ -3287,7 +3286,7 @@ static int raid1_reshape(struct mddev *mddev)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
- mempool_destroy(oldpool);
+ mempool_exit(&oldpool);
return 0;
}
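
Note: raid1_reshape() above shows the one wrinkle of the embedded API: newpool and oldpool are now stack objects, so both are zeroed up front, making mempool_exit() harmless on early-error paths, and the live pool is moved by plain structure assignment while the array is frozen. A reduced sketch, assuming the caller has quiesced all users:

        #include <linux/mempool.h>
        #include <linux/string.h>

        static int swap_pool(mempool_t *live, int min_nr,
                             mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
                             void *data)
        {
                mempool_t newpool, oldpool;
                int ret;

                memset(&newpool, 0, sizeof(newpool));
                memset(&oldpool, 0, sizeof(oldpool));

                ret = mempool_init(&newpool, min_nr, alloc_fn, free_fn, data);
                if (ret)
                        return ret;     /* mempool_init() cleans up after itself */

                /* all users of *live must be stopped at this point */
                oldpool = *live;        /* structure assignment moves the pool */
                *live = newpool;

                mempool_exit(&oldpool);
                return 0;
        }
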
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index eb84bc68e2fd..e7ccad898736 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -118,10 +118,10 @@ struct r1conf {
* mempools - it changes when the array grows or shrinks
*/
struct pool_info *poolinfo;
- mempool_t *r1bio_pool;
- mempool_t *r1buf_pool;
+ mempool_t r1bio_pool;
+ mempool_t r1buf_pool;
- struct bio_set *bio_split;
+ struct bio_set bio_split;
/* temporary buffer to synchronous IO when attempting to repair
* a read error.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3c60774c8430..478cf446827f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -175,7 +175,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
nalloc_rp = nalloc;
else
nalloc_rp = nalloc * 2;
- rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags);
+ rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags);
if (!rps)
goto out_free_r10bio;
@@ -255,9 +255,11 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
for (j = conf->copies; j--; ) {
struct bio *bio = r10bio->devs[j].bio;
- rp = get_resync_pages(bio);
- resync_free_pages(rp);
- bio_put(bio);
+ if (bio) {
+ rp = get_resync_pages(bio);
+ resync_free_pages(rp);
+ bio_put(bio);
+ }
bio = r10bio->devs[j].repl_bio;
if (bio)
@@ -291,14 +293,14 @@ static void free_r10bio(struct r10bio *r10_bio)
struct r10conf *conf = r10_bio->mddev->private;
put_all_bios(conf, r10_bio);
- mempool_free(r10_bio, conf->r10bio_pool);
+ mempool_free(r10_bio, &conf->r10bio_pool);
}
static void put_buf(struct r10bio *r10_bio)
{
struct r10conf *conf = r10_bio->mddev->private;
- mempool_free(r10_bio, conf->r10buf_pool);
+ mempool_free(r10_bio, &conf->r10buf_pool);
lower_barrier(conf);
}
@@ -1204,7 +1206,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
(unsigned long long)r10_bio->sector);
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
- gfp, conf->bio_split);
+ gfp, &conf->bio_split);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
@@ -1213,7 +1215,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
}
slot = r10_bio->read_slot;
- read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
+ read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
r10_bio->devs[slot].bio = read_bio;
r10_bio->devs[slot].rdev = rdev;
@@ -1261,7 +1263,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
} else
rdev = conf->mirrors[devnum].rdev;
- mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
if (replacement)
r10_bio->devs[n_copy].repl_bio = mbio;
else
@@ -1509,7 +1511,7 @@ retry_write:
if (r10_bio->sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, r10_bio->sectors,
- GFP_NOIO, conf->bio_split);
+ GFP_NOIO, &conf->bio_split);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
@@ -1533,7 +1535,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;
- r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+ r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
r10_bio->master_bio = bio;
r10_bio->sectors = sectors;
@@ -1732,8 +1734,7 @@ static void close_sync(struct r10conf *conf)
wait_barrier(conf);
allow_barrier(conf);
- mempool_destroy(conf->r10buf_pool);
- conf->r10buf_pool = NULL;
+ mempool_exit(&conf->r10buf_pool);
}
static int raid10_spare_active(struct mddev *mddev)
@@ -2363,7 +2364,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
{
int sect = 0; /* Offset from r10_bio->sector */
int sectors = r10_bio->sectors;
- struct md_rdev*rdev;
+ struct md_rdev *rdev;
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int d = r10_bio->devs[r10_bio->read_slot].devnum;
@@ -2583,7 +2584,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
if (sectors > sect_to_write)
sectors = sect_to_write;
/* Write at 'sector' for 'sectors' */
- wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ wbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
wbio->bi_iter.bi_sector = wsector +
@@ -2816,25 +2817,25 @@ static void raid10d(struct md_thread *thread)
static int init_resync(struct r10conf *conf)
{
- int buffs;
- int i;
+ int ret, buffs, i;
buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
- BUG_ON(conf->r10buf_pool);
+ BUG_ON(mempool_initialized(&conf->r10buf_pool));
conf->have_replacement = 0;
for (i = 0; i < conf->geo.raid_disks; i++)
if (conf->mirrors[i].replacement)
conf->have_replacement = 1;
- conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
- if (!conf->r10buf_pool)
- return -ENOMEM;
+ ret = mempool_init(&conf->r10buf_pool, buffs,
+ r10buf_pool_alloc, r10buf_pool_free, conf);
+ if (ret)
+ return ret;
conf->next_resync = 0;
return 0;
}
static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
{
- struct r10bio *r10bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+ struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO);
struct resync_pages *rp;
struct bio *bio;
int nalloc;
@@ -2945,7 +2946,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t chunk_mask = conf->geo.chunk_mask;
int page_idx = 0;
- if (!conf->r10buf_pool)
+ if (!mempool_initialized(&conf->r10buf_pool))
if (init_resync(conf))
return 0;
@@ -3687,8 +3688,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
goto out;
/* FIXME calc properly */
- conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
- max(0,-mddev->delta_disks)),
+ conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks),
+ sizeof(struct raid10_info),
GFP_KERNEL);
if (!conf->mirrors)
goto out;
@@ -3699,13 +3700,13 @@ static struct r10conf *setup_conf(struct mddev *mddev)
conf->geo = geo;
conf->copies = copies;
- conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
- r10bio_pool_free, conf);
- if (!conf->r10bio_pool)
+ err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc,
+ r10bio_pool_free, conf);
+ if (err)
goto out;
- conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
- if (!conf->bio_split)
+ err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
+ if (err)
goto out;
calc_sectors(conf, mddev->dev_sectors);
@@ -3733,6 +3734,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
init_waitqueue_head(&conf->wait_barrier);
atomic_set(&conf->nr_pending, 0);
+ err = -ENOMEM;
conf->thread = md_register_thread(raid10d, mddev, "raid10");
if (!conf->thread)
goto out;
@@ -3742,11 +3744,10 @@ static struct r10conf *setup_conf(struct mddev *mddev)
out:
if (conf) {
- mempool_destroy(conf->r10bio_pool);
+ mempool_exit(&conf->r10bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
+ bioset_exit(&conf->bio_split);
kfree(conf);
}
return ERR_PTR(err);
@@ -3953,7 +3954,7 @@ static int raid10_run(struct mddev *mddev)
out_free_conf:
md_unregister_thread(&mddev->thread);
- mempool_destroy(conf->r10bio_pool);
+ mempool_exit(&conf->r10bio_pool);
safe_put_page(conf->tmppage);
kfree(conf->mirrors);
kfree(conf);
@@ -3966,13 +3967,12 @@ static void raid10_free(struct mddev *mddev, void *priv)
{
struct r10conf *conf = priv;
- mempool_destroy(conf->r10bio_pool);
+ mempool_exit(&conf->r10bio_pool);
safe_put_page(conf->tmppage);
kfree(conf->mirrors);
kfree(conf->mirrors_old);
kfree(conf->mirrors_new);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
+ bioset_exit(&conf->bio_split);
kfree(conf);
}
@@ -4129,11 +4129,10 @@ static int raid10_check_reshape(struct mddev *mddev)
conf->mirrors_new = NULL;
if (mddev->delta_disks > 0) {
/* allocate new 'mirrors' list */
- conf->mirrors_new = kzalloc(
- sizeof(struct raid10_info)
- *(mddev->raid_disks +
- mddev->delta_disks),
- GFP_KERNEL);
+ conf->mirrors_new =
+ kcalloc(mddev->raid_disks + mddev->delta_disks,
+ sizeof(struct raid10_info),
+ GFP_KERNEL);
if (!conf->mirrors_new)
return -ENOMEM;
}
@@ -4543,7 +4542,7 @@ read_more:
* on all the target devices.
*/
// FIXME
- mempool_free(r10_bio, conf->r10buf_pool);
+ mempool_free(r10_bio, &conf->r10buf_pool);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return sectors_done;
}
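
Note: the raid1/raid10 request paths above keep using one splitting idiom whenever an I/O crosses a geometry boundary: carve off the part that fits with bio_split() (allocated from the conf's embedded bio_split set, so forward progress is guaranteed under memory pressure), chain it so the original bio completes only after the split does, resubmit the trimmed remainder, and carry on with the front piece. A minimal sketch, with the boundary calculation left hypothetical:

        #include <linux/bio.h>
        #include <linux/blkdev.h>

        static struct bio *clamp_bio(struct bio *bio, unsigned int max_sectors,
                                     struct bio_set *split_set)
        {
                struct bio *split;

                if (max_sectors >= bio_sectors(bio))
                        return bio;             /* already fits */

                split = bio_split(bio, max_sectors, GFP_NOIO, split_set);
                bio_chain(split, bio);          /* bio completes after split */
                generic_make_request(bio);      /* requeue the remainder */
                return split;                   /* caller handles the front */
        }
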
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index e2e8840de9bf..d3eaaf3eb1bc 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -93,10 +93,10 @@ struct r10conf {
*/
wait_queue_head_t wait_barrier;
- mempool_t *r10bio_pool;
- mempool_t *r10buf_pool;
+ mempool_t r10bio_pool;
+ mempool_t r10buf_pool;
struct page *tmppage;
- struct bio_set *bio_split;
+ struct bio_set bio_split;
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 3c65f52b68f5..2b775abf377b 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -125,9 +125,9 @@ struct r5l_log {
struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */
struct kmem_cache *io_kc;
- mempool_t *io_pool;
- struct bio_set *bs;
- mempool_t *meta_pool;
+ mempool_t io_pool;
+ struct bio_set bs;
+ mempool_t meta_pool;
struct md_thread *reclaim_thread;
unsigned long reclaim_target; /* number of space that need to be
@@ -579,7 +579,7 @@ static void r5l_log_endio(struct bio *bio)
md_error(log->rdev->mddev, log->rdev);
bio_put(bio);
- mempool_free(io->meta_page, log->meta_pool);
+ mempool_free(io->meta_page, &log->meta_pool);
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
@@ -748,7 +748,7 @@ static void r5l_submit_current_io(struct r5l_log *log)
static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
- struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
+ struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &log->bs);
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio_set_dev(bio, log->rdev->bdev);
@@ -780,7 +780,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
struct r5l_io_unit *io;
struct r5l_meta_block *block;
- io = mempool_alloc(log->io_pool, GFP_ATOMIC);
+ io = mempool_alloc(&log->io_pool, GFP_ATOMIC);
if (!io)
return NULL;
memset(io, 0, sizeof(*io));
@@ -791,7 +791,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
bio_list_init(&io->flush_barriers);
io->state = IO_UNIT_RUNNING;
- io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
+ io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO);
block = page_address(io->meta_page);
clear_page(block);
block->magic = cpu_to_le32(R5LOG_MAGIC);
@@ -1223,7 +1223,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
log->next_checkpoint = io->log_start;
list_del(&io->log_sibling);
- mempool_free(io, log->io_pool);
+ mempool_free(io, &log->io_pool);
r5l_run_no_mem_stripe(log);
found = true;
@@ -1647,7 +1647,7 @@ static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
{
struct page *page;
- ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, log->bs);
+ ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, &log->bs);
if (!ctx->ra_bio)
return -ENOMEM;
@@ -3066,6 +3066,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
struct request_queue *q = bdev_get_queue(rdev->bdev);
struct r5l_log *log;
char b[BDEVNAME_SIZE];
+ int ret;
pr_debug("md/raid:%s: using device %s as journal\n",
mdname(conf->mddev), bdevname(rdev->bdev, b));
@@ -3111,16 +3112,16 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
if (!log->io_kc)
goto io_kc;
- log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
- if (!log->io_pool)
+ ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc);
+ if (ret)
goto io_pool;
- log->bs = bioset_create(R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (!log->bs)
+ ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (ret)
goto io_bs;
- log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
- if (!log->meta_pool)
+ ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0);
+ if (ret)
goto out_mempool;
spin_lock_init(&log->tree_lock);
@@ -3155,11 +3156,11 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
rcu_assign_pointer(conf->log, NULL);
md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
- mempool_destroy(log->meta_pool);
+ mempool_exit(&log->meta_pool);
out_mempool:
- bioset_free(log->bs);
+ bioset_exit(&log->bs);
io_bs:
- mempool_destroy(log->io_pool);
+ mempool_exit(&log->io_pool);
io_pool:
kmem_cache_destroy(log->io_kc);
io_kc:
@@ -3178,9 +3179,9 @@ void r5l_exit_log(struct r5conf *conf)
wake_up(&conf->mddev->sb_wait);
flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
- mempool_destroy(log->meta_pool);
- bioset_free(log->bs);
- mempool_destroy(log->io_pool);
+ mempool_exit(&log->meta_pool);
+ bioset_exit(&log->bs);
+ mempool_exit(&log->io_pool);
kmem_cache_destroy(log->io_kc);
kfree(log);
}
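
Note: r5l_init_log() above keeps the canonical goto-unwind shape through the conversion; only the success tests change (an errno from mempool_init()/bioset_init() instead of a NULL pointer), and the labels still release resources in reverse order of acquisition. A stripped-down sketch, with hypothetical names and object size:

        #include <linux/bio.h>
        #include <linux/mempool.h>
        #include <linux/slab.h>

        struct log_sketch {
                struct kmem_cache *io_kc;
                mempool_t io_pool;
                struct bio_set bs;
        };

        static int log_sketch_init(struct log_sketch *log, int pool_size)
        {
                int ret;

                log->io_kc = kmem_cache_create("sketch_io", 128, 0, 0, NULL);
                if (!log->io_kc)
                        return -ENOMEM;

                ret = mempool_init_slab_pool(&log->io_pool, pool_size, log->io_kc);
                if (ret)
                        goto out_kc;

                ret = bioset_init(&log->bs, pool_size, 0, BIOSET_NEED_BVECS);
                if (ret)
                        goto out_pool;

                return 0;

        out_pool:
                mempool_exit(&log->io_pool);
        out_kc:
                kmem_cache_destroy(log->io_kc);
                return ret;
        }
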
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 42890a08375b..3a7c36326589 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -105,9 +105,9 @@ struct ppl_conf {
atomic64_t seq; /* current log write sequence number */
struct kmem_cache *io_kc;
- mempool_t *io_pool;
- struct bio_set *bs;
- struct bio_set *flush_bs;
+ mempool_t io_pool;
+ struct bio_set bs;
+ struct bio_set flush_bs;
/* used only for recovery */
int recovered_entries;
@@ -244,7 +244,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
struct ppl_header *pplhdr;
struct page *header_page;
- io = mempool_alloc(ppl_conf->io_pool, GFP_NOWAIT);
+ io = mempool_alloc(&ppl_conf->io_pool, GFP_NOWAIT);
if (!io)
return NULL;
@@ -503,7 +503,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
struct bio *prev = bio;
bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
- ppl_conf->bs);
+ &ppl_conf->bs);
bio->bi_opf = prev->bi_opf;
bio_copy_dev(bio, prev);
bio->bi_iter.bi_sector = bio_end_sector(prev);
@@ -570,7 +570,7 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io)
list_del(&io->log_sibling);
spin_unlock(&log->io_list_lock);
- mempool_free(io, ppl_conf->io_pool);
+ mempool_free(io, &ppl_conf->io_pool);
spin_lock(&ppl_conf->no_mem_stripes_lock);
if (!list_empty(&ppl_conf->no_mem_stripes)) {
@@ -642,7 +642,7 @@ static void ppl_do_flush(struct ppl_io_unit *io)
struct bio *bio;
char b[BDEVNAME_SIZE];
- bio = bio_alloc_bioset(GFP_NOIO, 0, ppl_conf->flush_bs);
+ bio = bio_alloc_bioset(GFP_NOIO, 0, &ppl_conf->flush_bs);
bio_set_dev(bio, bdev);
bio->bi_private = io;
bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
@@ -1246,11 +1246,9 @@ static void __ppl_exit_log(struct ppl_conf *ppl_conf)
kfree(ppl_conf->child_logs);
- if (ppl_conf->bs)
- bioset_free(ppl_conf->bs);
- if (ppl_conf->flush_bs)
- bioset_free(ppl_conf->flush_bs);
- mempool_destroy(ppl_conf->io_pool);
+ bioset_exit(&ppl_conf->bs);
+ bioset_exit(&ppl_conf->flush_bs);
+ mempool_exit(&ppl_conf->io_pool);
kmem_cache_destroy(ppl_conf->io_kc);
kfree(ppl_conf);
@@ -1387,24 +1385,18 @@ int ppl_init_log(struct r5conf *conf)
goto err;
}
- ppl_conf->io_pool = mempool_create(conf->raid_disks, ppl_io_pool_alloc,
- ppl_io_pool_free, ppl_conf->io_kc);
- if (!ppl_conf->io_pool) {
- ret = -ENOMEM;
+ ret = mempool_init(&ppl_conf->io_pool, conf->raid_disks, ppl_io_pool_alloc,
+ ppl_io_pool_free, ppl_conf->io_kc);
+ if (ret)
goto err;
- }
- ppl_conf->bs = bioset_create(conf->raid_disks, 0, BIOSET_NEED_BVECS);
- if (!ppl_conf->bs) {
- ret = -ENOMEM;
+ ret = bioset_init(&ppl_conf->bs, conf->raid_disks, 0, BIOSET_NEED_BVECS);
+ if (ret)
goto err;
- }
- ppl_conf->flush_bs = bioset_create(conf->raid_disks, 0, 0);
- if (!ppl_conf->flush_bs) {
- ret = -ENOMEM;
+ ret = bioset_init(&ppl_conf->flush_bs, conf->raid_disks, 0, 0);
+ if (ret)
goto err;
- }
ppl_conf->count = conf->raid_disks;
ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index be117d0a65a8..2031506a0ecd 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1139,6 +1139,9 @@ again:
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0;
bi->bi_iter.bi_size = STRIPE_SIZE;
+ bi->bi_write_hint = sh->dev[i].write_hint;
+ if (!rrdev)
+ sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
/*
* If this is discard request, set bi_vcnt 0. We don't
* want to confuse SCSI because SCSI will replace payload
@@ -1190,6 +1193,8 @@ again:
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_iter.bi_size = STRIPE_SIZE;
+ rbi->bi_write_hint = sh->dev[i].write_hint;
+ sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
/*
* If this is discard request, set bi_vcnt 0. We don't
* want to confuse SCSI because SCSI will replace payload
@@ -2391,7 +2396,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
* is completely stalled, so now is a good time to resize
* conf->disks and the scribble region
*/
- ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
+ ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO);
if (ndisks) {
for (i = 0; i < conf->pool_size; i++)
ndisks[i] = conf->disks[i];
@@ -3204,6 +3209,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
(unsigned long long)sh->sector);
spin_lock_irq(&sh->stripe_lock);
+ sh->dev[dd_idx].write_hint = bi->bi_write_hint;
/* Don't allow new IO added to stripes in batch list */
if (sh->batch_head)
goto overlap;
@@ -4614,15 +4620,15 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
sh->check_state = head_sh->check_state;
sh->reconstruct_state = head_sh->reconstruct_state;
+ spin_lock_irq(&sh->stripe_lock);
+ sh->batch_head = NULL;
+ spin_unlock_irq(&sh->stripe_lock);
for (i = 0; i < sh->disks; i++) {
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
do_wakeup = 1;
sh->dev[i].flags = head_sh->dev[i].flags &
(~((1 << R5_WriteError) | (1 << R5_Overlap)));
}
- spin_lock_irq(&sh->stripe_lock);
- sh->batch_head = NULL;
- spin_unlock_irq(&sh->stripe_lock);
if (handle_flags == 0 ||
sh->state & handle_flags)
set_bit(STRIPE_HANDLE, &sh->state);
@@ -5192,7 +5198,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
/*
* use bio_clone_fast to make a copy of the bio
*/
- align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set);
+ align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
if (!align_bi)
return 0;
/*
@@ -5277,7 +5283,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
if (sectors < bio_sectors(raid_bio)) {
struct r5conf *conf = mddev->private;
- split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split);
+ split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
bio_chain(split, raid_bio);
generic_make_request(raid_bio);
raid_bio = split;
@@ -6658,9 +6664,9 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
}
*group_cnt = num_possible_nodes();
size = sizeof(struct r5worker) * cnt;
- workers = kzalloc(size * *group_cnt, GFP_NOIO);
- *worker_groups = kzalloc(sizeof(struct r5worker_group) *
- *group_cnt, GFP_NOIO);
+ workers = kcalloc(size, *group_cnt, GFP_NOIO);
+ *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group),
+ GFP_NOIO);
if (!*worker_groups || !workers) {
kfree(workers);
kfree(*worker_groups);
@@ -6773,8 +6779,7 @@ static void free_conf(struct r5conf *conf)
if (conf->disks[i].extra_page)
put_page(conf->disks[i].extra_page);
kfree(conf->disks);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
+ bioset_exit(&conf->bio_split);
kfree(conf->stripe_hashtbl);
kfree(conf->pending_data);
kfree(conf);
@@ -6853,6 +6858,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
int i;
int group_cnt, worker_cnt_per_group;
struct r5worker_group *new_group;
+ int ret;
if (mddev->new_level != 5
&& mddev->new_level != 4
@@ -6888,8 +6894,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
INIT_LIST_HEAD(&conf->free_list);
INIT_LIST_HEAD(&conf->pending_list);
- conf->pending_data = kzalloc(sizeof(struct r5pending_data) *
- PENDING_IO_MAX, GFP_KERNEL);
+ conf->pending_data = kcalloc(PENDING_IO_MAX,
+ sizeof(struct r5pending_data),
+ GFP_KERNEL);
if (!conf->pending_data)
goto abort;
for (i = 0; i < PENDING_IO_MAX; i++)
@@ -6938,7 +6945,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
max_disks = max(conf->raid_disks, conf->previous_raid_disks);
- conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
+ conf->disks = kcalloc(max_disks, sizeof(struct disk_info),
GFP_KERNEL);
if (!conf->disks)
@@ -6950,8 +6957,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
}
- conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
- if (!conf->bio_split)
+ ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
+ if (ret)
goto abort;
conf->mddev = mddev;
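
Note: the write_hint additions in raid5.c and raid5.h plumb the submitter's write-lifetime hint through the stripe cache: add_stripe_bio() records bio->bi_write_hint in the per-device stripe state under stripe_lock, ops_run_io() copies it onto the outgoing member-disk bio, and the slot is reset to RWF_WRITE_LIFE_NOT_SET once consumed so a stale hint cannot leak into a later write. The hint originates in userspace; a sketch of setting it, assuming headers new enough to expose F_SET_RW_HINT (Linux >= 4.13, glibc >= 2.27):

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdint.h>

        /* tag an open file so its writes carry a short-lifetime hint */
        static int set_short_lifetime(int fd)
        {
                uint64_t hint = RWH_WRITE_LIFE_SHORT;

                return fcntl(fd, F_SET_RW_HINT, &hint);
        }
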
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 3f8da26032ac..8474c224127b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -257,6 +257,7 @@ struct stripe_head {
sector_t sector; /* sector of this page */
unsigned long flags;
u32 log_checksum;
+ unsigned short write_hint;
} dev[1]; /* allocated with extra space depending of RAID geometry */
};
@@ -669,7 +670,7 @@ struct r5conf {
int pool_size; /* number of disks in stripeheads in pool */
spinlock_t device_lock;
struct disk_info *disks;
- struct bio_set *bio_split;
+ struct bio_set bio_split;
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.