aboutsummaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig50
-rw-r--r--drivers/md/Makefile11
-rw-r--r--drivers/md/bcache/bcache.h7
-rw-r--r--drivers/md/bcache/bset.c1
-rw-r--r--drivers/md/bcache/btree.c9
-rw-r--r--drivers/md/bcache/btree.h2
-rw-r--r--drivers/md/bcache/closure.h1
-rw-r--r--drivers/md/bcache/debug.c2
-rw-r--r--drivers/md/bcache/io.c6
-rw-r--r--drivers/md/bcache/journal.c2
-rw-r--r--drivers/md/bcache/movinggc.c10
-rw-r--r--drivers/md/bcache/request.c40
-rw-r--r--drivers/md/bcache/request.h2
-rw-r--r--drivers/md/bcache/super.c30
-rw-r--r--drivers/md/bcache/sysfs.c1
-rw-r--r--drivers/md/bcache/util.c1
-rw-r--r--drivers/md/bcache/util.h14
-rw-r--r--drivers/md/bcache/writeback.c5
-rw-r--r--drivers/md/bitmap.c70
-rw-r--r--drivers/md/bitmap.h3
-rw-r--r--drivers/md/dm-bio-prison-v1.c (renamed from drivers/md/dm-bio-prison.c)52
-rw-r--r--drivers/md/dm-bio-prison-v1.h (renamed from drivers/md/dm-bio-prison.h)4
-rw-r--r--drivers/md/dm-bio-prison-v2.c369
-rw-r--r--drivers/md/dm-bio-prison-v2.h152
-rw-r--r--drivers/md/dm-bufio.c121
-rw-r--r--drivers/md/dm-bufio.h7
-rw-r--r--drivers/md/dm-cache-background-tracker.c243
-rw-r--r--drivers/md/dm-cache-background-tracker.h46
-rw-r--r--drivers/md/dm-cache-metadata.c370
-rw-r--r--drivers/md/dm-cache-metadata.h13
-rw-r--r--drivers/md/dm-cache-policy-cleaner.c469
-rw-r--r--drivers/md/dm-cache-policy-internal.h76
-rw-r--r--drivers/md/dm-cache-policy-smq.c818
-rw-r--r--drivers/md/dm-cache-policy.h187
-rw-r--r--drivers/md/dm-cache-target.c2552
-rw-r--r--drivers/md/dm-core.h8
-rw-r--r--drivers/md/dm-crypt.c1306
-rw-r--r--drivers/md/dm-delay.c1
-rw-r--r--drivers/md/dm-era-target.c12
-rw-r--r--drivers/md/dm-flakey.c34
-rw-r--r--drivers/md/dm-integrity.c3232
-rw-r--r--drivers/md/dm-io.c33
-rw-r--r--drivers/md/dm-ioctl.c142
-rw-r--r--drivers/md/dm-kcopyd.c71
-rw-r--r--drivers/md/dm-linear.c71
-rw-r--r--drivers/md/dm-log-writes.c13
-rw-r--r--drivers/md/dm-mpath.c416
-rw-r--r--drivers/md/dm-raid.c490
-rw-r--r--drivers/md/dm-raid1.c51
-rw-r--r--drivers/md/dm-round-robin.c67
-rw-r--r--drivers/md/dm-rq.c355
-rw-r--r--drivers/md/dm-rq.h4
-rw-r--r--drivers/md/dm-snap-persistent.c3
-rw-r--r--drivers/md/dm-snap.c21
-rw-r--r--drivers/md/dm-stats.c8
-rw-r--r--drivers/md/dm-stripe.c85
-rw-r--r--drivers/md/dm-table.c310
-rw-r--r--drivers/md/dm-target.c17
-rw-r--r--drivers/md/dm-thin-metadata.c6
-rw-r--r--drivers/md/dm-thin.c113
-rw-r--r--drivers/md/dm-verity-fec.c22
-rw-r--r--drivers/md/dm-verity-fec.h4
-rw-r--r--drivers/md/dm-verity-target.c219
-rw-r--r--drivers/md/dm-verity.h23
-rw-r--r--drivers/md/dm-zero.c4
-rw-r--r--drivers/md/dm-zoned-metadata.c2509
-rw-r--r--drivers/md/dm-zoned-reclaim.c570
-rw-r--r--drivers/md/dm-zoned-target.c967
-rw-r--r--drivers/md/dm-zoned.h228
-rw-r--r--drivers/md/dm.c458
-rw-r--r--drivers/md/dm.h11
-rw-r--r--drivers/md/faulty.c7
-rw-r--r--drivers/md/linear.c122
-rw-r--r--drivers/md/linear.h1
-rw-r--r--drivers/md/md-cluster.c229
-rw-r--r--drivers/md/md-cluster.h1
-rw-r--r--drivers/md/md.c559
-rw-r--r--drivers/md/md.h107
-rw-r--r--drivers/md/multipath.c22
-rw-r--r--drivers/md/persistent-data/dm-array.c21
-rw-r--r--drivers/md/persistent-data/dm-array.h1
-rw-r--r--drivers/md/persistent-data/dm-bitset.c146
-rw-r--r--drivers/md/persistent-data/dm-bitset.h39
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c14
-rw-r--r--drivers/md/persistent-data/dm-block-manager.h2
-rw-r--r--drivers/md/persistent-data/dm-btree.c26
-rw-r--r--drivers/md/persistent-data/dm-btree.h1
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.c16
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.c15
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c4
-rw-r--r--drivers/md/raid0.c178
-rw-r--r--drivers/md/raid1.c1185
-rw-r--r--drivers/md/raid1.h71
-rw-r--r--drivers/md/raid10.c778
-rw-r--r--drivers/md/raid10.h1
-rw-r--r--drivers/md/raid5-cache.c706
-rw-r--r--drivers/md/raid5-log.h116
-rw-r--r--drivers/md/raid5-ppl.c1271
-rw-r--r--drivers/md/raid5.c932
-rw-r--r--drivers/md/raid5.h120
100 files changed, 18358 insertions, 5963 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b7767da50c26..4a249ee86364 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -115,7 +115,7 @@ config MD_RAID10
RAID-10 requires mdadm-1.7.0 or later, available at:
- ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/
+ https://www.kernel.org/pub/linux/utils/raid/mdadm/
If unsure, say Y.
@@ -200,6 +200,7 @@ config BLK_DEV_DM_BUILTIN
config BLK_DEV_DM
tristate "Device mapper support"
select BLK_DEV_DM_BUILTIN
+ select DAX
---help---
Device-mapper is a low level volume manager. It works by allowing
people to specify mappings for ranges of logical sectors. Various
@@ -325,14 +326,6 @@ config DM_CACHE_SMQ
of less memory utilization, improved performance and increased
adaptability in the face of changing workloads.
-config DM_CACHE_CLEANER
- tristate "Cleaner Cache Policy (EXPERIMENTAL)"
- depends on DM_CACHE
- default y
- ---help---
- A simple cache policy that writes back all data to the
- origin. Used when decommissioning a dm-cache.
-
config DM_ERA
tristate "Era target (EXPERIMENTAL)"
depends on BLK_DEV_DM
@@ -365,6 +358,7 @@ config DM_LOG_USERSPACE
config DM_RAID
tristate "RAID 1/4/5/6/10 target"
depends on BLK_DEV_DM
+ select MD_RAID0
select MD_RAID1
select MD_RAID10
select MD_RAID456
@@ -508,4 +502,42 @@ config DM_LOG_WRITES
If unsure, say N.
+config DM_INTEGRITY
+ tristate "Integrity target support"
+ depends on BLK_DEV_DM
+ select BLK_DEV_INTEGRITY
+ select DM_BUFIO
+ select CRYPTO
+ select ASYNC_XOR
+ ---help---
+ This device-mapper target emulates a block device that has
+ additional per-sector tags that can be used for storing
+ integrity information.
+
+ This integrity target is used with the dm-crypt target to
+ provide authenticated disk encryption or it can be used
+ standalone.
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-integrity.
+
+config DM_ZONED
+ tristate "Drive-managed zoned block device target support"
+ depends on BLK_DEV_DM
+ depends on BLK_DEV_ZONED
+ ---help---
+ This device-mapper target takes a host-managed or host-aware zoned
+ block device and exposes most of its capacity as a regular block
+ device (drive-managed zoned block device) without any write
+ constraints. This is mainly intended for use with file systems that
+ do not natively support zoned block devices but still want to
+ benefit from the increased capacity offered by SMR disks. Other uses
+ by applications using raw block devices (for example object stores)
+ are also possible.
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-zoned.
+
+ If unsure, say N.
+
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 3cbda1af87a0..786ec9e86d65 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -11,14 +11,16 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
dm-mirror-y += dm-raid1.o
dm-log-userspace-y \
+= dm-log-userspace-base.o dm-log-userspace-transfer.o
+dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o
dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
-dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
+dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
+ dm-cache-background-tracker.o
dm-cache-smq-y += dm-cache-policy-smq.o
-dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
dm-verity-y += dm-verity-target.o
md-mod-y += md.o bitmap.o
-raid456-y += raid5.o raid5-cache.o
+raid456-y += raid5.o raid5-cache.o raid5-ppl.o
+dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
@@ -56,9 +58,10 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
obj-$(CONFIG_DM_VERITY) += dm-verity.o
obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
-obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
+obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
+obj-$(CONFIG_DM_ZONED) += dm-zoned.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index c3ea03c9a1a8..dee542fff68e 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -849,10 +849,11 @@ static inline void wake_up_allocators(struct cache_set *c)
/* Forward declarations */
-void bch_count_io_errors(struct cache *, int, const char *);
+void bch_count_io_errors(struct cache *, blk_status_t, const char *);
void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
- int, const char *);
-void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *);
+ blk_status_t, const char *);
+void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t,
+ const char *);
void bch_bbio_free(struct bio *, struct cache_set *);
struct bio *bch_bbio_alloc(struct cache_set *);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 646fe85261c1..18526d44688d 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -11,6 +11,7 @@
#include "bset.h"
#include <linux/console.h>
+#include <linux/sched/clock.h>
#include <linux/random.h>
#include <linux/prefetch.h>
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index a43eedd5804d..866dcf78ff8e 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -32,6 +32,9 @@
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
+#include <linux/sched/clock.h>
+#include <linux/rculist.h>
+
#include <trace/events/bcache.h>
/*
@@ -304,7 +307,7 @@ static void bch_btree_node_read(struct btree *b)
bch_submit_bbio(bio, b->c, &b->key, 0);
closure_sync(&cl);
- if (bio->bi_error)
+ if (bio->bi_status)
set_btree_node_io_error(b);
bch_bbio_free(bio, b->c);
@@ -371,10 +374,10 @@ static void btree_node_write_endio(struct bio *bio)
struct closure *cl = bio->bi_private;
struct btree *b = container_of(cl, struct btree, io);
- if (bio->bi_error)
+ if (bio->bi_status)
set_btree_node_io_error(b);
- bch_bbio_count_io_errors(b->c, bio, bio->bi_error, "writing btree");
+ bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree");
closure_put(cl);
}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 9b80417cd547..73da1f5626cb 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -207,7 +207,7 @@ void bkey_put(struct cache_set *c, struct bkey *k);
struct btree_op {
/* for waiting on btree reserve in btree_split() */
- wait_queue_t wait;
+ wait_queue_entry_t wait;
/* Btree level at which we start taking write locks */
short lock;
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 9b2fe2d3e3a9..1ec84ca81146 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -3,6 +3,7 @@
#include <linux/llist.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/workqueue.h>
/*
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 06f55056aaae..35a5a7210e51 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -110,7 +110,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
struct bio_vec bv, cbv;
struct bvec_iter iter, citer = { 0 };
- check = bio_clone(bio, GFP_NOIO);
+ check = bio_clone_kmalloc(bio, GFP_NOIO);
if (!check)
return;
check->bi_opf = REQ_OP_READ;
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index db45a88c0ce9..6a9b85095e7b 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -50,7 +50,7 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
/* IO errors */
-void bch_count_io_errors(struct cache *ca, int error, const char *m)
+void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
{
/*
* The halflife of an error is:
@@ -103,7 +103,7 @@ void bch_count_io_errors(struct cache *ca, int error, const char *m)
}
void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
- int error, const char *m)
+ blk_status_t error, const char *m)
{
struct bbio *b = container_of(bio, struct bbio, bio);
struct cache *ca = PTR_CACHE(c, &b->key, 0);
@@ -132,7 +132,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
}
void bch_bbio_endio(struct cache_set *c, struct bio *bio,
- int error, const char *m)
+ blk_status_t error, const char *m)
{
struct closure *cl = bio->bi_private;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 1198e53d5670..0352d05e495c 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -549,7 +549,7 @@ static void journal_write_endio(struct bio *bio)
{
struct journal_write *w = bio->bi_private;
- cache_set_err_on(bio->bi_error, w->c, "journal io error");
+ cache_set_err_on(bio->bi_status, w->c, "journal io error");
closure_put(&w->c->journal.io);
}
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 13b8a907006d..f633b30c962e 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -63,14 +63,14 @@ static void read_moving_endio(struct bio *bio)
struct moving_io *io = container_of(bio->bi_private,
struct moving_io, cl);
- if (bio->bi_error)
- io->op.error = bio->bi_error;
+ if (bio->bi_status)
+ io->op.status = bio->bi_status;
else if (!KEY_DIRTY(&b->key) &&
ptr_stale(io->op.c, &b->key, 0)) {
- io->op.error = -EINTR;
+ io->op.status = BLK_STS_IOERR;
}
- bch_bbio_endio(io->op.c, bio, bio->bi_error, "reading data to move");
+ bch_bbio_endio(io->op.c, bio, bio->bi_status, "reading data to move");
}
static void moving_init(struct moving_io *io)
@@ -92,7 +92,7 @@ static void write_moving(struct closure *cl)
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct data_insert_op *op = &io->op;
- if (!op->error) {
+ if (!op->status) {
moving_init(io);
io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 76d20875503c..019b3df9f1c6 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -81,7 +81,7 @@ static void bch_data_insert_keys(struct closure *cl)
if (ret == -ESRCH) {
op->replace_collision = true;
} else if (ret) {
- op->error = -ENOMEM;
+ op->status = BLK_STS_RESOURCE;
op->insert_data_done = true;
}
@@ -178,17 +178,17 @@ static void bch_data_insert_endio(struct bio *bio)
struct closure *cl = bio->bi_private;
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
- if (bio->bi_error) {
+ if (bio->bi_status) {
/* TODO: We could try to recover from this. */
if (op->writeback)
- op->error = bio->bi_error;
+ op->status = bio->bi_status;
else if (!op->replace)
set_closure_fn(cl, bch_data_insert_error, op->wq);
else
set_closure_fn(cl, NULL, NULL);
}
- bch_bbio_endio(op->c, bio, bio->bi_error, "writing data to cache");
+ bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache");
}
static void bch_data_insert_start(struct closure *cl)
@@ -488,15 +488,15 @@ static void bch_cache_read_endio(struct bio *bio)
* from the backing device.
*/
- if (bio->bi_error)
- s->iop.error = bio->bi_error;
+ if (bio->bi_status)
+ s->iop.status = bio->bi_status;
else if (!KEY_DIRTY(&b->key) &&
ptr_stale(s->iop.c, &b->key, 0)) {
atomic_long_inc(&s->iop.c->cache_read_races);
- s->iop.error = -EINTR;
+ s->iop.status = BLK_STS_IOERR;
}
- bch_bbio_endio(s->iop.c, bio, bio->bi_error, "reading from cache");
+ bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from cache");
}
/*
@@ -593,9 +593,9 @@ static void request_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
- if (bio->bi_error) {
+ if (bio->bi_status) {
struct search *s = container_of(cl, struct search, cl);
- s->iop.error = bio->bi_error;
+ s->iop.status = bio->bi_status;
/* Only cache read errors are recoverable */
s->recoverable = false;
}
@@ -611,7 +611,7 @@ static void bio_complete(struct search *s)
&s->d->disk->part0, s->start_time);
trace_bcache_request_end(s->d, s->orig_bio);
- s->orig_bio->bi_error = s->iop.error;
+ s->orig_bio->bi_status = s->iop.status;
bio_endio(s->orig_bio);
s->orig_bio = NULL;
}
@@ -664,9 +664,9 @@ static inline struct search *search_alloc(struct bio *bio,
s->iop.inode = d->id;
s->iop.write_point = hash_long((unsigned long) current, 16);
s->iop.write_prio = 0;
- s->iop.error = 0;
+ s->iop.status = 0;
s->iop.flags = 0;
- s->iop.flush_journal = (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) != 0;
+ s->iop.flush_journal = op_is_flush(bio->bi_opf);
s->iop.wq = bcache_wq;
return s;
@@ -707,7 +707,7 @@ static void cached_dev_read_error(struct closure *cl)
/* Retry from the backing device: */
trace_bcache_read_retry(s->orig_bio);
- s->iop.error = 0;
+ s->iop.status = 0;
do_bio_hook(s, s->orig_bio);
/* XXX: invalidate cache */
@@ -767,7 +767,7 @@ static void cached_dev_read_done_bh(struct closure *cl)
!s->cache_miss, s->iop.bypass);
trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
- if (s->iop.error)
+ if (s->iop.status)
continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
else if (s->iop.bio || verify(dc, &s->bio.bio))
continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
@@ -1009,7 +1009,7 @@ static int cached_dev_congested(void *data, int bits)
struct request_queue *q = bdev_get_queue(dc->bdev);
int ret = 0;
- if (bdi_congested(&q->backing_dev_info, bits))
+ if (bdi_congested(q->backing_dev_info, bits))
return 1;
if (cached_dev_get(dc)) {
@@ -1018,7 +1018,7 @@ static int cached_dev_congested(void *data, int bits)
for_each_cache(ca, d->c, i) {
q = bdev_get_queue(ca->bdev);
- ret |= bdi_congested(&q->backing_dev_info, bits);
+ ret |= bdi_congested(q->backing_dev_info, bits);
}
cached_dev_put(dc);
@@ -1032,7 +1032,7 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
struct gendisk *g = dc->disk.disk;
g->queue->make_request_fn = cached_dev_make_request;
- g->queue->backing_dev_info.congested_fn = cached_dev_congested;
+ g->queue->backing_dev_info->congested_fn = cached_dev_congested;
dc->disk.cache_miss = cached_dev_cache_miss;
dc->disk.ioctl = cached_dev_ioctl;
}
@@ -1125,7 +1125,7 @@ static int flash_dev_congested(void *data, int bits)
for_each_cache(ca, d->c, i) {
q = bdev_get_queue(ca->bdev);
- ret |= bdi_congested(&q->backing_dev_info, bits);
+ ret |= bdi_congested(q->backing_dev_info, bits);
}
return ret;
@@ -1136,7 +1136,7 @@ void bch_flash_dev_request_init(struct bcache_device *d)
struct gendisk *g = d->disk;
g->queue->make_request_fn = flash_dev_make_request;
- g->queue->backing_dev_info.congested_fn = flash_dev_congested;
+ g->queue->backing_dev_info->congested_fn = flash_dev_congested;
d->cache_miss = flash_dev_cache_miss;
d->ioctl = flash_dev_ioctl;
}
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 1ff36875c2b3..7689176951ce 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -10,7 +10,7 @@ struct data_insert_op {
unsigned inode;
uint16_t write_point;
uint16_t write_prio;
- short error;
+ blk_status_t status;
union {
uint16_t flags;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 3a19cbc8b230..8352fad765f6 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -271,7 +271,7 @@ static void write_super_endio(struct bio *bio)
{
struct cache *ca = bio->bi_private;
- bch_count_io_errors(ca, bio->bi_error, "writing superblock");
+ bch_count_io_errors(ca, bio->bi_status, "writing superblock");
closure_put(&ca->set->sb_write);
}
@@ -321,7 +321,7 @@ static void uuid_endio(struct bio *bio)
struct closure *cl = bio->bi_private;
struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
- cache_set_err_on(bio->bi_error, c, "accessing uuids");
+ cache_set_err_on(bio->bi_status, c, "accessing uuids");
bch_bbio_free(bio, c);
closure_put(cl);
}
@@ -494,7 +494,7 @@ static void prio_endio(struct bio *bio)
{
struct cache *ca = bio->bi_private;
- cache_set_err_on(bio->bi_error, ca->set, "accessing priorities");
+ cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
bch_bbio_free(bio, ca->set);
closure_put(&ca->prio);
}
@@ -767,16 +767,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
}
n = d->nr_stripes * sizeof(atomic_t);
- d->stripe_sectors_dirty = n < PAGE_SIZE << 6
- ? kzalloc(n, GFP_KERNEL)
- : vzalloc(n);
+ d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
if (!d->stripe_sectors_dirty)
return -ENOMEM;
n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
- d->full_dirty_stripes = n < PAGE_SIZE << 6
- ? kzalloc(n, GFP_KERNEL)
- : vzalloc(n);
+ d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
if (!d->full_dirty_stripes)
return -ENOMEM;
@@ -786,7 +782,9 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
minor *= BCACHE_MINORS;
- if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+ if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
+ BIOSET_NEED_BVECS |
+ BIOSET_NEED_RESCUER)) ||
!(d->disk = alloc_disk(BCACHE_MINORS))) {
ida_simple_remove(&bcache_minor, minor);
return -ENOMEM;
@@ -807,7 +805,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
blk_queue_make_request(q, NULL);
d->disk->queue = q;
q->queuedata = d;
- q->backing_dev_info.congested_data = d;
+ q->backing_dev_info->congested_data = d;
q->limits.max_hw_sectors = UINT_MAX;
q->limits.max_sectors = UINT_MAX;
q->limits.max_segment_size = UINT_MAX;
@@ -1132,9 +1130,9 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
set_capacity(dc->disk.disk,
dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
- dc->disk.disk->queue->backing_dev_info.ra_pages =
- max(dc->disk.disk->queue->backing_dev_info.ra_pages,
- q->backing_dev_info.ra_pages);
+ dc->disk.disk->queue->backing_dev_info->ra_pages =
+ max(dc->disk.disk->queue->backing_dev_info->ra_pages,
+ q->backing_dev_info->ra_pages);
bch_cached_dev_request_init(dc);
bch_cached_dev_writeback_init(dc);
@@ -1520,7 +1518,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
sizeof(struct bbio) + sizeof(struct bio_vec) *
bucket_pages(c))) ||
!(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
- !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+ !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio),
+ BIOSET_NEED_BVECS |
+ BIOSET_NEED_RESCUER)) ||
!(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
!(c->moving_gc_wq = alloc_workqueue("bcache_gc",
WQ_MEM_RECLAIM, 0)) ||
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index b3ff57d61dde..f90f13616980 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -13,6 +13,7 @@
#include <linux/blkdev.h>
#include <linux/sort.h>
+#include <linux/sched/clock.h>
static const char * const cache_replacement_policies[] = {
"lru",
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index dde6172f3f10..8c3a938f4bf0 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/types.h>
+#include <linux/sched/clock.h>
#include "util.h"
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index cf2cbc211d83..cb8d2ccbb6c6 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -4,8 +4,8 @@
#include <linux/blkdev.h>
#include <linux/errno.h>
-#include <linux/blkdev.h>
#include <linux/kernel.h>
+#include <linux/sched/clock.h>
#include <linux/llist.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
@@ -43,11 +43,7 @@ struct closure;
(heap)->used = 0; \
(heap)->size = (_size); \
_bytes = (heap)->size * sizeof(*(heap)->data); \
- (heap)->data = NULL; \
- if (_bytes < KMALLOC_MAX_SIZE) \
- (heap)->data = kmalloc(_bytes, (gfp)); \
- if ((!(heap)->data) && ((gfp) & GFP_KERNEL)) \
- (heap)->data = vmalloc(_bytes); \
+ (heap)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \
(heap)->data; \
})
@@ -136,12 +132,8 @@ do { \
\
(fifo)->mask = _allocated_size - 1; \
(fifo)->front = (fifo)->back = 0; \
- (fifo)->data = NULL; \
\
- if (_bytes < KMALLOC_MAX_SIZE) \
- (fifo)->data = kmalloc(_bytes, (gfp)); \
- if ((!(fifo)->data) && ((gfp) & GFP_KERNEL)) \
- (fifo)->data = vmalloc(_bytes); \
+ (fifo)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \
(fifo)->data; \
})
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 69e1ae59cab8..42c66e76f05e 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -13,6 +13,7 @@
#include <linux/delay.h>
#include <linux/kthread.h>
+#include <linux/sched/clock.h>
#include <trace/events/bcache.h>
/* Rate limiting */
@@ -166,7 +167,7 @@ static void dirty_endio(struct bio *bio)
struct keybuf_key *w = bio->bi_private;
struct dirty_io *io = w->private;
- if (bio->bi_error)
+ if (bio->bi_status)
SET_KEY_DIRTY(&w->key, false);
closure_put(&io->cl);
@@ -194,7 +195,7 @@ static void read_dirty_endio(struct bio *bio)
struct dirty_io *io = w->private;
bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
- bio->bi_error, "reading dirty data from cache");
+ bio->bi_status, "reading dirty data from cache");
dirty_endio(bio);
}
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9fb2ccac958a..40f3cd7eab0f 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -156,7 +156,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset,
rdev_for_each(rdev, mddev) {
if (! test_bit(In_sync, &rdev->flags)
- || test_bit(Faulty, &rdev->flags))
+ || test_bit(Faulty, &rdev->flags)
+ || test_bit(Bitmap_sync, &rdev->flags))
continue;
target = offset + index * (PAGE_SIZE/512);
@@ -471,6 +472,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
kunmap_atomic(sb);
write_page(bitmap, bitmap->storage.sb_page, 1);
}
+EXPORT_SYMBOL(bitmap_update_sb);
/* print out the bitmap file superblock */
void bitmap_print_sb(struct bitmap *bitmap)
@@ -484,10 +486,10 @@ void bitmap_print_sb(struct bitmap *bitmap)
pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
pr_debug(" version: %d\n", le32_to_cpu(sb->version));
pr_debug(" uuid: %08x.%08x.%08x.%08x\n",
- *(__u32 *)(sb->uuid+0),
- *(__u32 *)(sb->uuid+4),
- *(__u32 *)(sb->uuid+8),
- *(__u32 *)(sb->uuid+12));
+ le32_to_cpu(*(__u32 *)(sb->uuid+0)),
+ le32_to_cpu(*(__u32 *)(sb->uuid+4)),
+ le32_to_cpu(*(__u32 *)(sb->uuid+8)),
+ le32_to_cpu(*(__u32 *)(sb->uuid+12)));
pr_debug(" events: %llu\n",
(unsigned long long) le64_to_cpu(sb->events));
pr_debug("events cleared: %llu\n",
@@ -696,7 +698,7 @@ re_read:
out:
kunmap_atomic(sb);
- /* Assiging chunksize is required for "re_read" */
+ /* Assigning chunksize is required for "re_read" */
bitmap->mddev->bitmap_info.chunksize = chunksize;
if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
err = md_setup_cluster(bitmap->mddev, nodes);
@@ -1727,7 +1729,7 @@ void bitmap_flush(struct mddev *mddev)
/*
* free memory that was allocated
*/
-static void bitmap_free(struct bitmap *bitmap)
+void bitmap_free(struct bitmap *bitmap)
{
unsigned long k, pages;
struct bitmap_page *bp;
@@ -1761,6 +1763,21 @@ static void bitmap_free(struct bitmap *bitmap)
kfree(bp);
kfree(bitmap);
}
+EXPORT_SYMBOL(bitmap_free);
+
+void bitmap_wait_behind_writes(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ /* wait for behind writes to complete */
+ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+ pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
+ mdname(mddev));
+ /* need to kick something here to make sure I/O goes? */
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
+ }
+}
void bitmap_destroy(struct mddev *mddev)
{
@@ -1769,6 +1786,8 @@ void bitmap_destroy(struct mddev *mddev)
if (!bitmap) /* there was no bitmap */
return;
+ bitmap_wait_behind_writes(mddev);
+
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
mddev->bitmap = NULL; /* disconnect from the md device */
@@ -1920,6 +1939,27 @@ out:
}
EXPORT_SYMBOL_GPL(bitmap_load);
+struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
+{
+ int rv = 0;
+ struct bitmap *bitmap;
+
+ bitmap = bitmap_create(mddev, slot);
+ if (IS_ERR(bitmap)) {
+ rv = PTR_ERR(bitmap);
+ return ERR_PTR(rv);
+ }
+
+ rv = bitmap_init_from_disk(bitmap, 0);
+ if (rv) {
+ bitmap_free(bitmap);
+ return ERR_PTR(rv);
+ }
+
+ return bitmap;
+}
+EXPORT_SYMBOL(get_bitmap_from_slot);
+
/* Loads the bitmap associated with slot and copies the resync information
* to our bitmap
*/
@@ -1929,14 +1969,13 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
int rv = 0, i, j;
sector_t block, lo = 0, hi = 0;
struct bitmap_counts *counts;
- struct bitmap *bitmap = bitmap_create(mddev, slot);
-
- if (IS_ERR(bitmap))
- return PTR_ERR(bitmap);
+ struct bitmap *bitmap;
- rv = bitmap_init_from_disk(bitmap, 0);
- if (rv)
- goto err;
+ bitmap = get_bitmap_from_slot(mddev, slot);
+ if (IS_ERR(bitmap)) {
+ pr_err("%s can't get bitmap from slot %d\n", __func__, slot);
+ return -1;
+ }
counts = &bitmap->counts;
for (j = 0; j < counts->chunks; j++) {
@@ -1963,8 +2002,7 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
bitmap_unplug(mddev->bitmap);
*low = lo;
*high = hi;
-err:
- bitmap_free(bitmap);
+
return rv;
}
EXPORT_SYMBOL_GPL(bitmap_copy_from_slot);
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 5b6dd63dda91..d15721ac07a6 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -267,8 +267,11 @@ void bitmap_daemon_work(struct mddev *mddev);
int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int chunksize, int init);
+struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
int bitmap_copy_from_slot(struct mddev *mddev, int slot,
sector_t *lo, sector_t *hi, bool clear_bits);
+void bitmap_free(struct bitmap *bitmap);
+void bitmap_wait_behind_writes(struct mddev *mddev);
#endif
#endif
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison-v1.c
index 03af174485d3..874841f0fc83 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -5,7 +5,8 @@
*/
#include "dm.h"
-#include "dm-bio-prison.h"
+#include "dm-bio-prison-v1.h"
+#include "dm-bio-prison-v2.h"
#include <linux/spinlock.h>
#include <linux/mempool.h>
@@ -115,7 +116,7 @@ static int __bio_detain(struct dm_bio_prison *prison,
while (*new) {
struct dm_bio_prison_cell *cell =
- container_of(*new, struct dm_bio_prison_cell, node);
+ rb_entry(*new, struct dm_bio_prison_cell, node);
r = cmp_keys(key, &cell->key);
@@ -228,7 +229,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
void dm_cell_error(struct dm_bio_prison *prison,
- struct dm_bio_prison_cell *cell, int error)
+ struct dm_bio_prison_cell *cell, blk_status_t error)
{
struct bio_list bios;
struct bio *bio;
@@ -237,7 +238,7 @@ void dm_cell_error(struct dm_bio_prison *prison,
dm_cell_release(prison, cell, &bios);
while ((bio = bio_list_pop(&bios))) {
- bio->bi_error = error;
+ bio->bi_status = error;
bio_endio(bio);
}
}
@@ -398,7 +399,7 @@ EXPORT_SYMBOL_GPL(dm_deferred_set_add_work);
/*----------------------------------------------------------------*/
-static int __init dm_bio_prison_init(void)
+static int __init dm_bio_prison_init_v1(void)
{
_cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
if (!_cell_cache)
@@ -407,12 +408,51 @@ static int __init dm_bio_prison_init(void)
return 0;
}
-static void __exit dm_bio_prison_exit(void)
+static void dm_bio_prison_exit_v1(void)
{
kmem_cache_destroy(_cell_cache);
_cell_cache = NULL;
}
+static int (*_inits[])(void) __initdata = {
+ dm_bio_prison_init_v1,
+ dm_bio_prison_init_v2,
+};
+
+static void (*_exits[])(void) = {
+ dm_bio_prison_exit_v1,
+ dm_bio_prison_exit_v2,
+};
+
+static int __init dm_bio_prison_init(void)
+{
+ const int count = ARRAY_SIZE(_inits);
+
+ int r, i;
+
+ for (i = 0; i < count; i++) {
+ r = _inits[i]();
+ if (r)
+ goto bad;
+ }
+
+ return 0;
+
+ bad:
+ while (i--)
+ _exits[i]();
+
+ return r;
+}
+
+static void __exit dm_bio_prison_exit(void)
+{
+ int i = ARRAY_SIZE(_exits);
+
+ while (i--)
+ _exits[i]();
+}
+
/*
* module hooks
*/
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison-v1.h
index 54352f009bfd..cec52ac5e1ae 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison-v1.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2011-2012 Red Hat, Inc.
+ * Copyright (C) 2011-2017 Red Hat, Inc.
*
* This file is released under the GPL.
*/
@@ -91,7 +91,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell,
struct bio_list *inmates);
void dm_cell_error(struct dm_bio_prison *prison,
- struct dm_bio_prison_cell *cell, int error);
+ struct dm_bio_prison_cell *cell, blk_status_t error);
/*
* Visits the cell and then releases. Guarantees no new inmates are
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
new file mode 100644
index 000000000000..8ce3a1a588cf
--- /dev/null
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (C) 2012-2017 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "dm-bio-prison-v2.h"
+
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+
+/*----------------------------------------------------------------*/
+
+/* Minimum number of cells reserved in each prison's mempool. */
+#define MIN_CELLS 1024
+
+/*
+ * A prison: a set of cells kept in an rb-tree and ordered by cmp_keys().
+ */
+struct dm_bio_prison_v2 {
+ /* Quiesce continuations are queued here (see __put() and __quiesce()). */
+ struct workqueue_struct *wq;
+
+ /* Held, with irqs saved, by the dm_cell_*_v2() entry points around the __ helpers. */
+ spinlock_t lock;
+ /* Backs dm_bio_prison_alloc_cell_v2() / dm_bio_prison_free_cell_v2(). */
+ mempool_t *cell_pool;
+ /* Root of the tree of cells currently in use. */
+ struct rb_root cells;
+};
+
+/* Slab cache for cells; created in dm_bio_prison_init_v2(). */
+static struct kmem_cache *_cell_cache;
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Allocates a prison.  @wq is stored and later used to queue quiesce
+ * continuations.  The cell mempool always reserves MIN_CELLS cells; the
+ * size is not configurable by the caller.
+ *
+ * Returns NULL if either the prison or its mempool cannot be allocated.
+ */
+struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq)
+{
+	struct dm_bio_prison_v2 *prison;
+
+	prison = kmalloc(sizeof(*prison), GFP_KERNEL);
+	if (!prison)
+		return NULL;
+
+	prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
+	if (!prison->cell_pool) {
+		kfree(prison);
+		return NULL;
+	}
+
+	prison->wq = wq;
+	prison->cells = RB_ROOT;
+	spin_lock_init(&prison->lock);
+
+	return prison;
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_create_v2);
+
+/*
+ * Destroys the cell mempool and frees the prison.  The cell tree itself
+ * is not walked here.
+ * NOTE(review): presumably every cell must already have been released by
+ * the caller - confirm.
+ */
+void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison)
+{
+ mempool_destroy(prison->cell_pool);
+ kfree(prison);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_destroy_v2);
+
+/*
+ * Takes a cell from the prison's mempool using the allocation flags in
+ * @gfp.  The cell is returned uninitialised; __setup_new_cell() fills it
+ * in when it is inserted into the tree.
+ */
+struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, gfp_t gfp)
+{
+ return mempool_alloc(prison->cell_pool, gfp);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell_v2);
+
+/*
+ * Returns @cell to the prison's mempool.
+ */
+void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_bio_prison_cell_v2 *cell)
+{
+ mempool_free(cell, prison->cell_pool);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell_v2);
+
+/*
+ * Resets @cell to an all-zero state (no exclusive lock, shared_count 0,
+ * no continuation), copies @key into it and initialises its bio list.
+ */
+static void __setup_new_cell(struct dm_cell_key_v2 *key,
+ struct dm_bio_prison_cell_v2 *cell)
+{
+ memset(cell, 0, sizeof(*cell));
+ memcpy(&cell->key, key, sizeof(cell->key));
+ bio_list_init(&cell->bios);
+}
+
+/*
+ * Three-way comparison of two keys.  Keys are ordered by ->virtual, then
+ * by ->dev, then by block range.  Two ranges on the same device compare
+ * equal (0) whenever neither one lies entirely before the other, i.e.
+ * whenever they overlap.
+ */
+static int cmp_keys(struct dm_cell_key_v2 *lhs,
+		    struct dm_cell_key_v2 *rhs)
+{
+	if (lhs->virtual != rhs->virtual)
+		return lhs->virtual < rhs->virtual ? -1 : 1;
+
+	if (lhs->dev != rhs->dev)
+		return lhs->dev < rhs->dev ? -1 : 1;
+
+	/* lhs ends at or before the start of rhs */
+	if (lhs->block_end <= rhs->block_begin)
+		return -1;
+
+	/* lhs starts at or after the end of rhs, otherwise they overlap */
+	return lhs->block_begin >= rhs->block_end ? 1 : 0;
+}
+
+/*
+ * Looks up the cell matching @key (as decided by cmp_keys(), so an
+ * overlapping range counts as a match).
+ *
+ * Returns true if a cell was found; *result points at it and
+ * @cell_prealloc is left untouched.  Otherwise @cell_prealloc is
+ * initialised for @key, linked into the tree, stored in *result, and
+ * false is returned.
+ */
+static bool __find_or_insert(struct dm_bio_prison_v2 *prison,
+ struct dm_cell_key_v2 *key,
+ struct dm_bio_prison_cell_v2 *cell_prealloc,
+ struct dm_bio_prison_cell_v2 **result)
+{
+ int r;
+ /* 'new' tracks the link to follow or fill in; 'parent' is the node owning that link. */
+ struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
+
+ while (*new) {
+ struct dm_bio_prison_cell_v2 *cell =
+ rb_entry(*new, struct dm_bio_prison_cell_v2, node);
+
+ r = cmp_keys(key, &cell->key);
+
+ parent = *new;
+ if (r < 0)
+ new = &((*new)->rb_left);
+
+ else if (r > 0)
+ new = &((*new)->rb_right);
+
+ else {
+ *result = cell;
+ return true;
+ }
+ }
+
+ /* Not present: 'new' now addresses the empty link where the cell belongs. */
+ __setup_new_cell(key, cell_prealloc);
+ *result = cell_prealloc;
+ rb_link_node(&cell_prealloc->node, parent, new);
+ rb_insert_color(&cell_prealloc->node, &prison->cells);
+
+ return false;
+}
+
+/*
+ * Takes a shared lock on the cell for @key, inserting @cell_prealloc if
+ * no cell exists yet.
+ *
+ * If the cell is exclusively locked at a level >= @lock_level the lock is
+ * refused: @inmate is appended to the cell's bio list and false is
+ * returned.  Otherwise the shared count is bumped and true is returned.
+ */
+static bool __get(struct dm_bio_prison_v2 *prison,
+		  struct dm_cell_key_v2 *key,
+		  unsigned lock_level,
+		  struct bio *inmate,
+		  struct dm_bio_prison_cell_v2 *cell_prealloc,
+		  struct dm_bio_prison_cell_v2 **cell)
+{
+	struct dm_bio_prison_cell_v2 *c;
+
+	if (!__find_or_insert(prison, key, cell_prealloc, cell)) {
+		/* Freshly inserted cell: we are its first shared holder. */
+		(*cell)->shared_count = 1;
+		return true;
+	}
+
+	c = *cell;
+	if (c->exclusive_lock && lock_level <= c->exclusive_level) {
+		bio_list_add(&c->bios, inmate);
+		return false;
+	}
+
+	c->shared_count++;
+	return true;
+}
+
+/*
+ * Locked entry point for taking a shared lock: runs __get() with the
+ * prison spinlock held and interrupts saved.
+ *
+ * Returns true if the shared lock was granted, false if @inmate was
+ * parked on the cell's bio list instead.  *cell_result is set to the cell
+ * that was found or inserted.
+ */
+bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison,
+		    struct dm_cell_key_v2 *key,
+		    unsigned lock_level,
+		    struct bio *inmate,
+		    struct dm_bio_prison_cell_v2 *cell_prealloc,
+		    struct dm_bio_prison_cell_v2 **cell_result)
+{
+	/* bool, matching both __get() and this function's return type */
+	bool r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&prison->lock, flags);
+	r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result);
+	spin_unlock_irqrestore(&prison->lock, flags);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_cell_get_v2);
+
+/*
+ * Drops one shared reference on @cell.
+ *
+ * When the last shared reference goes away:
+ *  - if no exclusive lock is held, the cell is removed from the tree and
+ *    true is returned (the caller now owns the cell);
+ *  - if an exclusive lock is held, any registered quiesce continuation is
+ *    queued on the prison's workqueue and cleared.
+ * In every other case false is returned.
+ */
+static bool __put(struct dm_bio_prison_v2 *prison,
+		  struct dm_bio_prison_cell_v2 *cell)
+{
+	BUG_ON(!cell->shared_count);
+	cell->shared_count--;
+
+	// FIXME: shared locks granted above the lock level could starve this
+	if (cell->shared_count)
+		return false;
+
+	if (!cell->exclusive_lock) {
+		rb_erase(&cell->node, &prison->cells);
+		return true;
+	}
+
+	if (cell->quiesce_continuation) {
+		queue_work(prison->wq, cell->quiesce_continuation);
+		cell->quiesce_continuation = NULL;
+	}
+
+	return false;
+}
+
+/*
+ * Locked entry point for dropping a shared lock: runs __put() with the
+ * prison spinlock held and interrupts saved.  Returns true if the cell
+ * was removed from the tree.
+ */
+bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_bio_prison_cell_v2 *cell)
+{
+ bool r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ r = __put(prison, cell);
+ spin_unlock_irqrestore(&prison->lock, flags);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_cell_put_v2);
+
+/*
+ * Takes the exclusive lock on the cell for @key at @lock_level, inserting
+ * @cell_prealloc if no cell exists yet.
+ *
+ * Returns -EBUSY (leaving *cell_result untouched) if the cell is already
+ * exclusively locked.  Otherwise *cell_result is set and the return value
+ * is 1 if shared holders exist and must be quiesced, 0 if there are none.
+ */
+static int __lock(struct dm_bio_prison_v2 *prison,
+		  struct dm_cell_key_v2 *key,
+		  unsigned lock_level,
+		  struct dm_bio_prison_cell_v2 *cell_prealloc,
+		  struct dm_bio_prison_cell_v2 **cell_result)
+{
+	struct dm_bio_prison_cell_v2 *cell;
+	bool existed = __find_or_insert(prison, key, cell_prealloc, &cell);
+
+	if (existed && cell->exclusive_lock)
+		return -EBUSY;
+
+	/* A freshly inserted cell (cell == cell_prealloc) has no shared holders. */
+	if (!existed)
+		cell->shared_count = 0;
+
+	cell->exclusive_lock = true;
+	cell->exclusive_level = lock_level;
+	*cell_result = cell;
+
+	// FIXME: we don't yet know what level these shared locks
+	// were taken at, so have to quiesce them all.
+	return cell->shared_count > 0;
+}
+
+/*
+ * Locked entry point for taking an exclusive lock: runs __lock() with the
+ * prison spinlock held and interrupts saved, and passes its return value
+ * (-EBUSY, 0 or 1) back to the caller.
+ */
+int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_cell_key_v2 *key,
+ unsigned lock_level,
+ struct dm_bio_prison_cell_v2 *cell_prealloc,
+ struct dm_bio_prison_cell_v2 **cell_result)
+{
+ int r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ r = __lock(prison, key, lock_level, cell_prealloc, cell_result);
+ spin_unlock_irqrestore(&prison->lock, flags);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_cell_lock_v2);
+
+/*
+ * Runs @continuation once @cell has no shared holders: if shared holders
+ * remain it is stored on the cell for __put() to queue later, otherwise
+ * it is queued on the prison's workqueue straight away.
+ */
+static void __quiesce(struct dm_bio_prison_v2 *prison,
+		      struct dm_bio_prison_cell_v2 *cell,
+		      struct work_struct *continuation)
+{
+	if (cell->shared_count) {
+		cell->quiesce_continuation = continuation;
+		return;
+	}
+
+	queue_work(prison->wq, continuation);
+}
+
+/*
+ * Locked entry point for __quiesce(): runs it with the prison spinlock
+ * held and interrupts saved.
+ */
+void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_bio_prison_cell_v2 *cell,
+ struct work_struct *continuation)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ __quiesce(prison, cell, continuation);
+ spin_unlock_irqrestore(&prison->lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_cell_quiesce_v2);
+
+/*
+ * Changes the level of an exclusive lock that is already held on @cell.
+ *
+ * Returns -EINVAL if @cell is not exclusively locked; otherwise 1 if
+ * shared holders exist, 0 if there are none.  The new level is stored
+ * unconditionally; it is not checked against the current level.
+ */
+static int __promote(struct dm_bio_prison_v2 *prison,
+ struct dm_bio_prison_cell_v2 *cell,
+ unsigned new_lock_level)
+{
+ if (!cell->exclusive_lock)
+ return -EINVAL;
+
+ cell->exclusive_level = new_lock_level;
+ return cell->shared_count > 0;
+}
+
+/*
+ * Locked entry point for __promote(): runs it with the prison spinlock
+ * held and interrupts saved, and passes its return value back.
+ */
+int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_bio_prison_cell_v2 *cell,
+ unsigned new_lock_level)
+{
+ int r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ r = __promote(prison, cell, new_lock_level);
+ spin_unlock_irqrestore(&prison->lock, flags);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_cell_lock_promote_v2);
+
+/*
+ * Releases the exclusive lock on @cell.  Any bios that were parked on the
+ * cell are moved onto @bios.
+ *
+ * If shared holders remain, the exclusive flag is cleared, the cell stays
+ * in the tree and false is returned.  Otherwise the cell is removed from
+ * the tree and true is returned (the caller now owns the cell).
+ */
+static bool __unlock(struct dm_bio_prison_v2 *prison,
+		     struct dm_bio_prison_cell_v2 *cell,
+		     struct bio_list *bios)
+{
+	BUG_ON(!cell->exclusive_lock);
+
+	bio_list_merge(bios, &cell->bios);
+	bio_list_init(&cell->bios);
+
+	if (cell->shared_count) {
+		/* exclusive_lock is a bool; __lock() sets it with 'true' */
+		cell->exclusive_lock = false;
+		return false;
+	}
+
+	rb_erase(&cell->node, &prison->cells);
+	return true;
+}
+
+/*
+ * Locked entry point for __unlock(): runs it with the prison spinlock
+ * held and interrupts saved.  Returns true if the cell was removed from
+ * the tree.
+ */
+bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_bio_prison_cell_v2 *cell,
+ struct bio_list *bios)
+{
+ bool r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ r = __unlock(prison, cell, bios);
+ spin_unlock_irqrestore(&prison->lock, flags);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_cell_unlock_v2);
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Creates the slab cache that backs every v2 prison's cell mempool.
+ * Returns 0 on success, -ENOMEM if the cache cannot be created.
+ */
+int __init dm_bio_prison_init_v2(void)
+{
+	_cell_cache = KMEM_CACHE(dm_bio_prison_cell_v2, 0);
+
+	return _cell_cache ? 0 : -ENOMEM;
+}
+
+/*
+ * Destroys the v2 cell slab cache and clears the pointer to it.
+ */
+void dm_bio_prison_exit_v2(void)
+{
+ kmem_cache_destroy(_cell_cache);
+ _cell_cache = NULL;
+}
diff --git a/drivers/md/dm-bio-prison-v2.h b/drivers/md/dm-bio-prison-v2.h
new file mode 100644
index 000000000000..6e04234268db
--- /dev/null
+++ b/drivers/md/dm-bio-prison-v2.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2011-2017 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_BIO_PRISON_V2_H
+#define DM_BIO_PRISON_V2_H
+
+#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */
+#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */
+
+#include <linux/bio.h>
+#include <linux/rbtree.h>
+#include <linux/workqueue.h>
+
+/*----------------------------------------------------------------*/
+
+int dm_bio_prison_init_v2(void);
+void dm_bio_prison_exit_v2(void);
+
+/*
+ * Sometimes we can't deal with a bio straight away. We put them in prison
+ * where they can't cause any mischief. Bios are put in a cell identified
+ * by a key, multiple bios can be in the same cell. When the cell is
+ * subsequently unlocked the bios become available.
+ */
+struct dm_bio_prison_v2;
+
+/*
+ * Keys define a range of blocks within either a virtual or physical
+ * device.
+ */
+struct dm_cell_key_v2 {
+ /* Compared first when keys are ordered. */
+ int virtual;
+ /* Compared second. */
+ dm_thin_id dev;
+ /* Block range; treated as half-open, [block_begin, block_end), when keys are compared. */
+ dm_block_t block_begin, block_end;
+};
+
+/*
+ * Treat this as opaque, only in header so callers can manage allocation
+ * themselves.
+ */
+struct dm_bio_prison_cell_v2 {
+ // FIXME: pack these
+ /* True while an exclusive lock is held on the cell. */
+ bool exclusive_lock;
+ /* Level of the exclusive lock; shared requests at or below it are refused. */
+ unsigned exclusive_level;
+ /* Number of shared locks currently held. */
+ unsigned shared_count;
+ /* Work item to queue once shared_count drops to zero; NULL if none is registered. */
+ struct work_struct *quiesce_continuation;
+
+ /* Linkage in the prison's rb-tree of cells. */
+ struct rb_node node;
+ /* Key this cell was created for. */
+ struct dm_cell_key_v2 key;
+ /* Bios whose shared lock request was refused; handed over on unlock. */
+ struct bio_list bios;
+};
+
+struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq);
+void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison);
+
+/*
+ * These two functions just wrap a mempool. This is a transitory step:
+ * Eventually all bio prison clients should manage their own cell memory.
+ *
+ * Like mempool_alloc(), dm_bio_prison_alloc_cell_v2() can only fail if called
+ * in interrupt context or passed GFP_NOWAIT.
+ */
+struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison,
+ gfp_t gfp);
+void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_bio_prison_cell_v2 *cell);
+
+/*
+ * Shared locks have a bio associated with them.
+ *
+ * If the lock is granted the caller can continue to use the bio, and must
+ * call dm_cell_put_v2() to drop the reference count when finished using it.
+ *
+ * If the lock cannot be granted then the bio will be tracked within the
+ * cell, and later given to the holder of the exclusive lock.
+ *
+ * See dm_cell_lock_v2() for discussion of the lock_level parameter.
+ *
+ * Compare *cell_result with cell_prealloc to see if the prealloc was used.
+ * If cell_prealloc was used then inmate wasn't added to it.
+ *
+ * Returns true if the lock is granted.
+ */
+bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_cell_key_v2 *key,
+ unsigned lock_level,
+ struct bio *inmate,
+ struct dm_bio_prison_cell_v2 *cell_prealloc,
+ struct dm_bio_prison_cell_v2 **cell_result);
+
+/*
+ * Decrement the shared reference count for the lock. Returns true if
+ * returning ownership of the cell (ie. you should free it).
+ */
+bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_bio_prison_cell_v2 *cell);
+
+/*
+ * Locks a cell. No associated bio. Exclusive locks get priority. These
+ * locks constrain whether the io locks are granted according to level.
+ *
+ * Shared locks will still be granted if the lock_level is > (not = to) the
+ * exclusive lock level.
+ *
+ * If an _exclusive_ lock is already held then -EBUSY is returned.
+ *
+ * Return values:
+ * < 0 - error
+ * 0 - locked; no quiescing needed
+ * 1 - locked; quiescing needed
+ */
+int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_cell_key_v2 *key,
+ unsigned lock_level,
+ struct dm_bio_prison_cell_v2 *cell_prealloc,
+ struct dm_bio_prison_cell_v2 **cell_result);
+
+/*
+ * Arranges for @continuation to be queued on the prison's workqueue once
+ * the cell has no shared holders.  If there are none now it is queued
+ * immediately; otherwise it is stored on the cell and queued when the
+ * last shared holder calls dm_cell_put_v2() while the exclusive lock is
+ * still held.  Only one continuation is stored per cell.
+ */
+void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_bio_prison_cell_v2 *cell,
+ struct work_struct *continuation);
+
+/*
+ * Promotes an _exclusive_ lock to a higher lock level.
+ *
+ * Return values:
+ * < 0 - error
+ * 0 - promoted; no quiescing needed
+ * 1 - promoted; quiescing needed
+ */
+int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_bio_prison_cell_v2 *cell,
+ unsigned new_lock_level);
+
+/*
+ * Adds any held bios to the bio list.
+ *
+ * There may be shared locks still held at this point even if you quiesced
+ * (ie. different lock levels).
+ *
+ * Returns true if returning ownership of the cell (ie. you should free
+ * it).
+ */
+bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_bio_prison_cell_v2 *cell,
+ struct bio_list *bios);
+
+/*----------------------------------------------------------------*/
+
+#endif
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 84d2f0e4c754..850ff6c67994 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -11,6 +11,7 @@
#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include <linux/jiffies.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
@@ -109,6 +110,8 @@ struct dm_bufio_client {
struct rb_root buffer_tree;
wait_queue_head_t free_buffer_wait;
+ sector_t start;
+
int async_write_error;
struct list_head client_list;
@@ -142,8 +145,8 @@ struct dm_buffer {
enum data_mode data_mode;
unsigned char list_mode; /* LIST_* */
unsigned hold_count;
- int read_error;
- int write_error;
+ blk_status_t read_error;
+ blk_status_t write_error;
unsigned long state;
unsigned long last_accessed;
struct dm_bufio_client *c;
@@ -215,7 +218,7 @@ static DEFINE_SPINLOCK(param_spinlock);
* Buffers are freed after this timeout
*/
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
-static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
+static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
@@ -403,7 +406,7 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
if (gfp_mask & __GFP_NORETRY)
noio_flag = memalloc_noio_save();
- ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);
+ ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
if (gfp_mask & __GFP_NORETRY)
memalloc_noio_restore(noio_flag);
@@ -552,12 +555,12 @@ static void dmio_complete(unsigned long error, void *context)
{
struct dm_buffer *b = context;
- b->bio.bi_error = error ? -EIO : 0;
+ b->bio.bi_status = error ? BLK_STS_IOERR : 0;
b->bio.bi_end_io(&b->bio);
}
-static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
- bio_end_io_t *end_io)
+static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
+ unsigned n_sectors, bio_end_io_t *end_io)
{
int r;
struct dm_io_request io_req = {
@@ -569,8 +572,8 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
};
struct dm_io_region region = {
.bdev = b->c->bdev,
- .sector = block << b->c->sectors_per_block_bits,
- .count = b->c->block_size >> SECTOR_SHIFT,
+ .sector = sector,
+ .count = n_sectors,
};
if (b->data_mode != DATA_MODE_VMALLOC) {
@@ -585,7 +588,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
r = dm_io(&io_req, 1, &region, NULL);
if (r) {
- b->bio.bi_error = r;
+ b->bio.bi_status = errno_to_blk_status(r);
end_io(&b->bio);
}
}
@@ -593,7 +596,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
static void inline_endio(struct bio *bio)
{
bio_end_io_t *end_fn = bio->bi_private;
- int error = bio->bi_error;
+ blk_status_t status = bio->bi_status;
/*
* Reset the bio to free any attached resources
@@ -601,18 +604,18 @@ static void inline_endio(struct bio *bio)
*/
bio_reset(bio);
- bio->bi_error = error;
+ bio->bi_status = status;
end_fn(bio);
}
-static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
- bio_end_io_t *end_io)
+static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector,
+ unsigned n_sectors, bio_end_io_t *end_io)
{
char *ptr;
int len;
bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS);
- b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
+ b->bio.bi_iter.bi_sector = sector;
b->bio.bi_bdev = b->c->bdev;
b->bio.bi_end_io = inline_endio;
/*
@@ -627,7 +630,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
* If len < PAGE_SIZE the buffer doesn't cross page boundary.
*/
ptr = b->data;
- len = b->c->block_size;
+ len = n_sectors << SECTOR_SHIFT;
if (len >= PAGE_SIZE)
BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
@@ -639,7 +642,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
len < PAGE_SIZE ? len : PAGE_SIZE,
offset_in_page(ptr))) {
BUG_ON(b->c->block_size <= PAGE_SIZE);
- use_dmio(b, rw, block, end_io);
+ use_dmio(b, rw, sector, n_sectors, end_io);
return;
}
@@ -650,17 +653,22 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
submit_bio(&b->bio);
}
-static void submit_io(struct dm_buffer *b, int rw, sector_t block,
- bio_end_io_t *end_io)
+static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io)
{
+ unsigned n_sectors;
+ sector_t sector;
+
if (rw == WRITE && b->c->write_callback)
b->c->write_callback(b);
- if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
+ sector = (b->block << b->c->sectors_per_block_bits) + b->c->start;
+ n_sectors = 1 << b->c->sectors_per_block_bits;
+
+ if (n_sectors <= ((DM_BUFIO_INLINE_VECS * PAGE_SIZE) >> SECTOR_SHIFT) &&
b->data_mode != DATA_MODE_VMALLOC)
- use_inline_bio(b, rw, block, end_io);
+ use_inline_bio(b, rw, sector, n_sectors, end_io);
else
- use_dmio(b, rw, block, end_io);
+ use_dmio(b, rw, sector, n_sectors, end_io);
}
/*----------------------------------------------------------------
@@ -677,11 +685,12 @@ static void write_endio(struct bio *bio)
{
struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
- b->write_error = bio->bi_error;
- if (unlikely(bio->bi_error)) {
+ b->write_error = bio->bi_status;
+ if (unlikely(bio->bi_status)) {
struct dm_bufio_client *c = b->c;
- int error = bio->bi_error;
- (void)cmpxchg(&c->async_write_error, 0, error);
+
+ (void)cmpxchg(&c->async_write_error, 0,
+ blk_status_to_errno(bio->bi_status));
}
BUG_ON(!test_bit(B_WRITING, &b->state));
@@ -712,7 +721,7 @@ static void __write_dirty_buffer(struct dm_buffer *b,
wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
if (!write_list)
- submit_io(b, WRITE, b->block, write_endio);
+ submit_io(b, WRITE, write_endio);
else
list_add_tail(&b->write_list, write_list);
}
@@ -725,7 +734,7 @@ static void __flush_write_list(struct list_head *write_list)
struct dm_buffer *b =
list_entry(write_list->next, struct dm_buffer, write_list);
list_del(&b->write_list);
- submit_io(b, WRITE, b->block, write_endio);
+ submit_io(b, WRITE, write_endio);
cond_resched();
}
blk_finish_plug(&plug);
@@ -794,7 +803,7 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c)
DECLARE_WAITQUEUE(wait, current);
add_wait_queue(&c->free_buffer_wait, &wait);
- set_task_state(current, TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
dm_bufio_unlock(c);
io_schedule();
@@ -932,10 +941,11 @@ static void __get_memory_limit(struct dm_bufio_client *c,
{
unsigned long buffers;
- if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
- mutex_lock(&dm_bufio_clients_lock);
- __cache_size_refresh();
- mutex_unlock(&dm_bufio_clients_lock);
+ if (unlikely(ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) {
+ if (mutex_trylock(&dm_bufio_clients_lock)) {
+ __cache_size_refresh();
+ mutex_unlock(&dm_bufio_clients_lock);
+ }
}
buffers = dm_bufio_cache_size_per_client >>
@@ -1054,7 +1064,7 @@ static void read_endio(struct bio *bio)
{
struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
- b->read_error = bio->bi_error;
+ b->read_error = bio->bi_status;
BUG_ON(!test_bit(B_READING, &b->state));
@@ -1093,12 +1103,12 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
return NULL;
if (need_submit)
- submit_io(b, READ, b->block, read_endio);
+ submit_io(b, READ, read_endio);
wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
if (b->read_error) {
- int error = b->read_error;
+ int error = blk_status_to_errno(b->read_error);
dm_bufio_release(b);
@@ -1163,7 +1173,7 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
dm_bufio_unlock(c);
if (need_submit)
- submit_io(b, READ, b->block, read_endio);
+ submit_io(b, READ, read_endio);
dm_bufio_release(b);
cond_resched();
@@ -1248,7 +1258,8 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
*/
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
- int a, f;
+ blk_status_t a;
+ int f;
unsigned long buffers_processed = 0;
struct dm_buffer *b, *tmp;
@@ -1325,7 +1336,7 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
struct dm_io_request io_req = {
.bi_op = REQ_OP_WRITE,
- .bi_op_flags = REQ_PREFLUSH,
+ .bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
.mem.type = DM_IO_KMEM,
.mem.ptr.addr = NULL,
.client = c->dm_io,
@@ -1404,7 +1415,7 @@ retry:
old_block = b->block;
__unlink_buffer(b);
__link_buffer(b, new_block, b->list_mode);
- submit_io(b, WRITE, new_block, write_endio);
+ submit_io(b, WRITE, write_endio);
wait_on_bit_io(&b->state, B_WRITING,
TASK_UNINTERRUPTIBLE);
__unlink_buffer(b);
@@ -1549,10 +1560,10 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
return true;
}
-static unsigned get_retain_buffers(struct dm_bufio_client *c)
+static unsigned long get_retain_buffers(struct dm_bufio_client *c)
{
- unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
- return retain_bytes / c->block_size;
+ unsigned long retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
+ return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT);
}
static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
@@ -1562,7 +1573,7 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
struct dm_buffer *b, *tmp;
unsigned long freed = 0;
unsigned long count = nr_to_scan;
- unsigned retain_target = get_retain_buffers(c);
+ unsigned long retain_target = get_retain_buffers(c);
for (l = 0; l < LIST_SIZE; l++) {
list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
@@ -1761,6 +1772,12 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
+/*
+ * Records @start as the client's sector offset.  submit_io() adds this
+ * value to the sector computed from each buffer's block number.
+ */
+void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
+{
+ c->start = start;
+}
+EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
+
static unsigned get_max_age_hz(void)
{
unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);
@@ -1779,11 +1796,19 @@ static bool older_than(struct dm_buffer *b, unsigned long age_hz)
static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
{
struct dm_buffer *b, *tmp;
- unsigned retain_target = get_retain_buffers(c);
- unsigned count;
+ unsigned long retain_target = get_retain_buffers(c);
+ unsigned long count;
+ LIST_HEAD(write_list);
dm_bufio_lock(c);
+ __check_watermark(c, &write_list);
+ if (unlikely(!list_empty(&write_list))) {
+ dm_bufio_unlock(c);
+ __flush_write_list(&write_list);
+ dm_bufio_lock(c);
+ }
+
count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
if (count <= retain_target)
@@ -1808,6 +1833,8 @@ static void cleanup_old_buffers(void)
mutex_lock(&dm_bufio_clients_lock);
+ __cache_size_refresh();
+
list_for_each_entry(c, &dm_bufio_all_clients, client_list)
__evict_old_buffers(c, max_age_hz);
@@ -1930,7 +1957,7 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
-module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
+module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index c096779a7292..b6d8f53ec15b 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -32,6 +32,13 @@ dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
void dm_bufio_client_destroy(struct dm_bufio_client *c);
/*
+ * Set the sector offset that is added to every block's start sector.
+ * When this function is called, there must be no I/O in progress on the bufio
+ * client.
+ */
+void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start);
+
+/*
* WARNING: to avoid deadlocks, these conditions are observed:
*
* - At most one thread can hold at most "reserved_buffers" simultaneously.
diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c
new file mode 100644
index 000000000000..707233891291
--- /dev/null
+++ b/drivers/md/dm-cache-background-tracker.c
@@ -0,0 +1,243 @@
+/*
+ * Copyright (C) 2017 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-background-tracker.h"
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX "dm-background-tracker"
+
+/*
+ * One tracked piece of background work.
+ */
+struct bt_work {
+ /* Linkage on either the tracker's 'queued' or 'issued' list. */
+ struct list_head list;
+ /* Linkage in the tracker's 'pending' rb-tree, keyed by work.oblock. */
+ struct rb_node node;
+ /* Copy of the caller's work description; pointers to this are handed out. */
+ struct policy_work work;
+};
+
+struct background_tracker {
+ /* Stored at creation; not yet enforced (see max_work_reached()). */
+ unsigned max_work;
+ /* Per-op counts of tracked work, adjusted by update_stats(). */
+ atomic_t pending_promotes;
+ atomic_t pending_writebacks;
+ atomic_t pending_demotes;
+
+ /* Work already handed to the caller. */
+ struct list_head issued;
+ /* Work waiting to be handed out by btracker_issue(). */
+ struct list_head queued;
+ /* All tracked work (queued and issued), indexed by oblock. */
+ struct rb_root pending;
+
+ /* Slab cache that struct bt_work items are allocated from. */
+ struct kmem_cache *work_cache;
+};
+
+/*
+ * Allocates an empty tracker together with the slab cache its work items
+ * come from.  Returns NULL, after logging an error, if either allocation
+ * fails.
+ */
+struct background_tracker *btracker_create(unsigned max_work)
+{
+	struct background_tracker *tracker;
+
+	tracker = kmalloc(sizeof(*tracker), GFP_KERNEL);
+	if (!tracker) {
+		DMERR("couldn't create background_tracker");
+		return NULL;
+	}
+
+	tracker->max_work = max_work;
+	tracker->pending = RB_ROOT;
+	INIT_LIST_HEAD(&tracker->issued);
+	INIT_LIST_HEAD(&tracker->queued);
+
+	atomic_set(&tracker->pending_promotes, 0);
+	atomic_set(&tracker->pending_writebacks, 0);
+	atomic_set(&tracker->pending_demotes, 0);
+
+	tracker->work_cache = KMEM_CACHE(bt_work, 0);
+	if (tracker->work_cache)
+		return tracker;
+
+	/* The message says "mempool", but what failed is the slab cache. */
+	DMERR("couldn't create mempool for background work items");
+	kfree(tracker);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(btracker_create);
+
+/*
+ * Destroys the work-item slab cache and frees the tracker.  Work items
+ * still on the lists are not individually freed here.
+ */
+void btracker_destroy(struct background_tracker *b)
+{
+ kmem_cache_destroy(b->work_cache);
+ kfree(b);
+}
+EXPORT_SYMBOL_GPL(btracker_destroy);
+
+/*
+ * Three-way comparison of two origin blocks: -1, 0 or 1 according to
+ * whether @lhs is below, equal to or above @rhs.
+ */
+static int cmp_oblock(dm_oblock_t lhs, dm_oblock_t rhs)
+{
+	if (from_oblock(lhs) == from_oblock(rhs))
+		return 0;
+
+	return from_oblock(lhs) < from_oblock(rhs) ? -1 : 1;
+}
+
+/*
+ * Links @nw into the 'pending' tree, keyed by its oblock.
+ *
+ * Returns false, leaving the tree unchanged, if work for the same oblock
+ * is already present; true once @nw has been inserted.
+ */
+static bool __insert_pending(struct background_tracker *b,
+ struct bt_work *nw)
+{
+ int cmp;
+ struct bt_work *w;
+ struct rb_node **new = &b->pending.rb_node, *parent = NULL;
+
+ while (*new) {
+ w = container_of(*new, struct bt_work, node);
+
+ parent = *new;
+ /* Existing node is compared against the new one; __find_pending() uses the same direction. */
+ cmp = cmp_oblock(w->work.oblock, nw->work.oblock);
+ if (cmp < 0)
+ new = &((*new)->rb_left);
+
+ else if (cmp > 0)
+ new = &((*new)->rb_right);
+
+ else
+ /* already present */
+ return false;
+ }
+
+ rb_link_node(&nw->node, parent, new);
+ rb_insert_color(&nw->node, &b->pending);
+
+ return true;
+}
+
+/*
+ * Looks up the tracked work item for @oblock in the 'pending' tree.
+ * Returns NULL if there is none.  The descent direction mirrors
+ * __insert_pending().
+ */
+static struct bt_work *__find_pending(struct background_tracker *b,
+				      dm_oblock_t oblock)
+{
+	struct rb_node *n = b->pending.rb_node;
+
+	while (n) {
+		struct bt_work *w = container_of(n, struct bt_work, node);
+		int cmp = cmp_oblock(w->work.oblock, oblock);
+
+		if (!cmp)
+			return w;
+
+		n = (cmp < 0) ? n->rb_left : n->rb_right;
+	}
+
+	return NULL;
+}
+
+
+
+/*
+ * Adds @delta to the counter that matches @w's operation.  Operations
+ * other than promote, demote and writeback leave every counter untouched.
+ */
+static void update_stats(struct background_tracker *b, struct policy_work *w, int delta)
+{
+	atomic_t *counter = NULL;
+
+	switch (w->op) {
+	case POLICY_PROMOTE:
+		counter = &b->pending_promotes;
+		break;
+
+	case POLICY_DEMOTE:
+		counter = &b->pending_demotes;
+		break;
+
+	case POLICY_WRITEBACK:
+		counter = &b->pending_writebacks;
+		break;
+	}
+
+	if (counter)
+		atomic_add(delta, counter);
+}
+
+/*
+ * Returns the number of tracked writebacks.  The counter is raised in
+ * btracker_queue() and only lowered in btracker_complete(), so it covers
+ * issued work as well as work still on the queue.
+ */
+unsigned btracker_nr_writebacks_queued(struct background_tracker *b)
+{
+ return atomic_read(&b->pending_writebacks);
+}
+EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued);
+
+/*
+ * Returns the number of tracked demotions.  The counter is raised in
+ * btracker_queue() and only lowered in btracker_complete(), so it covers
+ * issued work as well as work still on the queue.
+ */
+unsigned btracker_nr_demotions_queued(struct background_tracker *b)
+{
+ return atomic_read(&b->pending_demotes);
+}
+EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued);
+
+/*
+ * Placeholder for the max_work limit: currently always reports that there
+ * is room, so b->max_work is never consulted.
+ */
+static bool max_work_reached(struct background_tracker *b)
+{
+ // FIXME: finish
+ return false;
+}
+
+/*
+ * Starts tracking a copy of @work.
+ *
+ * If @pwork is non-NULL the work counts as issued straight away and
+ * *pwork is set to the tracker's copy; otherwise the work is placed on
+ * the queue for a later btracker_issue().  On any failure *pwork (if
+ * given) is left NULL.
+ *
+ * Returns 0 on success, -ENOMEM if the work limit is reached or the
+ * allocation fails, -EINVAL if work for the same oblock is already
+ * tracked.
+ */
+int btracker_queue(struct background_tracker *b,
+		   struct policy_work *work,
+		   struct policy_work **pwork)
+{
+	struct bt_work *w;
+	struct list_head *dest;
+
+	if (pwork)
+		*pwork = NULL;
+
+	if (max_work_reached(b))
+		return -ENOMEM;
+
+	w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT);
+	if (!w)
+		return -ENOMEM;
+
+	memcpy(&w->work, work, sizeof(*work));
+
+	if (!__insert_pending(b, w)) {
+		/*
+		 * Work for this oblock is already being tracked; drop the
+		 * duplicate.
+		 */
+		kmem_cache_free(b->work_cache, w);
+		return -EINVAL;
+	}
+
+	dest = pwork ? &b->issued : &b->queued;
+	list_add(&w->list, dest);
+	if (pwork)
+		*pwork = &w->work;
+
+	update_stats(b, &w->work, 1);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(btracker_queue);
+
+/*
+ * Moves the entry at the head of the queue onto the issued list and
+ * hands it back through @work.
+ *
+ * Returns 0 on success, or -ENODATA (leaving *work untouched) if nothing
+ * is queued.
+ */
+int btracker_issue(struct background_tracker *b, struct policy_work **work)
+{
+	struct bt_work *head;
+
+	if (!list_empty(&b->queued)) {
+		head = list_first_entry(&b->queued, struct bt_work, list);
+		list_move(&head->list, &b->issued);
+		*work = &head->work;
+		return 0;
+	}
+
+	return -ENODATA;
+}
+EXPORT_SYMBOL_GPL(btracker_issue);
+
+/*
+ * Stops tracking @op, which must be a pointer previously handed out by
+ * btracker_queue() or btracker_issue(): it is unlinked from its list and
+ * from the pending tree, its counter is decremented, and the containing
+ * item is freed.
+ */
+void btracker_complete(struct background_tracker *b,
+		       struct policy_work *op)
+{
+	struct bt_work *item = container_of(op, struct bt_work, work);
+
+	list_del(&item->list);
+	rb_erase(&item->node, &b->pending);
+	update_stats(b, &item->work, -1);
+
+	kmem_cache_free(b->work_cache, item);
+}
+EXPORT_SYMBOL_GPL(btracker_complete);
+
+/*
+ * Returns true if any work is tracked for @oblock.  The lookup is by
+ * oblock only; the kind of work (promote, demote, writeback) is not
+ * checked here.
+ */
+bool btracker_promotion_already_present(struct background_tracker *b,
+ dm_oblock_t oblock)
+{
+ return __find_pending(b, oblock) != NULL;
+}
+EXPORT_SYMBOL_GPL(btracker_promotion_already_present);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-cache-background-tracker.h b/drivers/md/dm-cache-background-tracker.h
new file mode 100644
index 000000000000..27ab90dbc275
--- /dev/null
+++ b/drivers/md/dm-cache-background-tracker.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2017 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_BACKGROUND_WORK_H
+#define DM_CACHE_BACKGROUND_WORK_H
+
+#include <linux/vmalloc.h>
+#include "dm-cache-policy.h"
+
+/*----------------------------------------------------------------*/
+
+struct background_work;
+struct background_tracker;
+
+/*
+ * FIXME: discuss lack of locking in all methods.
+ */
+struct background_tracker *btracker_create(unsigned max_work);
+void btracker_destroy(struct background_tracker *b);
+
+unsigned btracker_nr_writebacks_queued(struct background_tracker *b);
+unsigned btracker_nr_demotions_queued(struct background_tracker *b);
+
+/*
+ * Returns -EINVAL iff the work is already queued, or -ENOMEM if the work
+ * couldn't be queued for any other reason.
+ */
+int btracker_queue(struct background_tracker *b,
+ struct policy_work *work,
+ struct policy_work **pwork);
+
+/*
+ * Returns -ENODATA if there's no work.
+ */
+int btracker_issue(struct background_tracker *b, struct policy_work **work);
+void btracker_complete(struct background_tracker *b,
+ struct policy_work *op);
+bool btracker_promotion_already_present(struct background_tracker *b,
+ dm_oblock_t oblock);
+
+/*----------------------------------------------------------------*/
+
+#endif
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 624fe4319b24..4a4e9c75fc4c 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -25,9 +25,7 @@
* defines a range of metadata versions that this module can handle.
*/
#define MIN_CACHE_VERSION 1
-#define MAX_CACHE_VERSION 1
-
-#define CACHE_METADATA_CACHE_SIZE 64
+#define MAX_CACHE_VERSION 2
/*
* 3 for btree insert +
@@ -55,6 +53,7 @@ enum mapping_bits {
/*
* The data on the cache is different from that on the origin.
+ * This flag is only used by metadata format 1.
*/
M_DIRTY = 2
};
@@ -93,12 +92,18 @@ struct cache_disk_superblock {
__le32 write_misses;
__le32 policy_version[CACHE_POLICY_VERSION_SIZE];
+
+ /*
+ * Metadata format 2 fields.
+ */
+ __le64 dirty_root;
} __packed;
struct dm_cache_metadata {
atomic_t ref_count;
struct list_head list;
+ unsigned version;
struct block_device *bdev;
struct dm_block_manager *bm;
struct dm_space_map *metadata_sm;
@@ -142,11 +147,18 @@ struct dm_cache_metadata {
bool fail_io:1;
/*
+ * Metadata format 2 fields.
+ */
+ dm_block_t dirty_root;
+ struct dm_disk_bitset dirty_info;
+
+ /*
* These structures are used when loading metadata. They're too
* big to put on the stack.
*/
struct dm_array_cursor mapping_cursor;
struct dm_array_cursor hint_cursor;
+ struct dm_bitset_cursor dirty_cursor;
};
/*-------------------------------------------------------------------
@@ -170,6 +182,7 @@ static void sb_prepare_for_write(struct dm_block_validator *v,
static int check_metadata_version(struct cache_disk_superblock *disk_super)
{
uint32_t metadata_version = le32_to_cpu(disk_super->version);
+
if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) {
DMERR("Cache metadata version %u found, but only versions between %u and %u supported.",
metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION);
@@ -310,6 +323,11 @@ static void __copy_sm_root(struct dm_cache_metadata *cmd,
sizeof(cmd->metadata_space_map_root));
}
+/*
+ * True when the metadata uses format 2 or later, i.e. when the dirty bits
+ * are tracked through dirty_root/dirty_info rather than the per-mapping
+ * M_DIRTY flag.
+ */
+static bool separate_dirty_bits(struct dm_cache_metadata *cmd)
+{
+	const unsigned first_separate_version = 2;
+
+	return cmd->version >= first_separate_version;
+}
+
static int __write_initial_superblock(struct dm_cache_metadata *cmd)
{
int r;
@@ -341,7 +359,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->flags = 0;
memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
- disk_super->version = cpu_to_le32(MAX_CACHE_VERSION);
+ disk_super->version = cpu_to_le32(cmd->version);
memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
disk_super->policy_hint_size = 0;
@@ -362,6 +380,9 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->write_hits = cpu_to_le32(0);
disk_super->write_misses = cpu_to_le32(0);
+ if (separate_dirty_bits(cmd))
+ disk_super->dirty_root = cpu_to_le64(cmd->dirty_root);
+
return dm_tm_commit(cmd->tm, sblock);
}
@@ -382,6 +403,13 @@ static int __format_metadata(struct dm_cache_metadata *cmd)
if (r < 0)
goto bad;
+ if (separate_dirty_bits(cmd)) {
+ dm_disk_bitset_init(cmd->tm, &cmd->dirty_info);
+ r = dm_bitset_empty(&cmd->dirty_info, &cmd->dirty_root);
+ if (r < 0)
+ goto bad;
+ }
+
dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root);
if (r < 0)
@@ -407,9 +435,10 @@ bad:
static int __check_incompat_features(struct cache_disk_superblock *disk_super,
struct dm_cache_metadata *cmd)
{
- uint32_t features;
+ uint32_t incompat_flags, features;
- features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP;
+ incompat_flags = le32_to_cpu(disk_super->incompat_flags);
+ features = incompat_flags & ~DM_CACHE_FEATURE_INCOMPAT_SUPP;
if (features) {
DMERR("could not access metadata due to unsupported optional features (%lx).",
(unsigned long)features);
@@ -470,6 +499,7 @@ static int __open_metadata(struct dm_cache_metadata *cmd)
}
__setup_mapping_info(cmd);
+ dm_disk_bitset_init(cmd->tm, &cmd->dirty_info);
dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
sb_flags = le32_to_cpu(disk_super->flags);
cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags);
@@ -503,7 +533,6 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
{
int r;
cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
- CACHE_METADATA_CACHE_SIZE,
CACHE_MAX_CONCURRENT_LOCKS);
if (IS_ERR(cmd->bm)) {
DMERR("could not create block manager");
@@ -548,6 +577,7 @@ static unsigned long clear_clean_shutdown(unsigned long flags)
static void read_superblock_fields(struct dm_cache_metadata *cmd,
struct cache_disk_superblock *disk_super)
{
+ cmd->version = le32_to_cpu(disk_super->version);
cmd->flags = le32_to_cpu(disk_super->flags);
cmd->root = le64_to_cpu(disk_super->mapping_root);
cmd->hint_root = le64_to_cpu(disk_super->hint_root);
@@ -567,6 +597,9 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,
cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits);
cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses);
+ if (separate_dirty_bits(cmd))
+ cmd->dirty_root = le64_to_cpu(disk_super->dirty_root);
+
cmd->changed = false;
}
@@ -625,6 +658,13 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
*/
BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512);
+ if (separate_dirty_bits(cmd)) {
+ r = dm_bitset_flush(&cmd->dirty_info, cmd->dirty_root,
+ &cmd->dirty_root);
+ if (r)
+ return r;
+ }
+
r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root,
&cmd->discard_root);
if (r)
@@ -649,6 +689,8 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
update_flags(disk_super, mutator);
disk_super->mapping_root = cpu_to_le64(cmd->root);
+ if (separate_dirty_bits(cmd))
+ disk_super->dirty_root = cpu_to_le64(cmd->dirty_root);
disk_super->hint_root = cpu_to_le64(cmd->hint_root);
disk_super->discard_root = cpu_to_le64(cmd->discard_root);
disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
@@ -698,7 +740,8 @@ static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags)
static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
sector_t data_block_size,
bool may_format_device,
- size_t policy_hint_size)
+ size_t policy_hint_size,
+ unsigned metadata_version)
{
int r;
struct dm_cache_metadata *cmd;
@@ -709,6 +752,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
return ERR_PTR(-ENOMEM);
}
+ cmd->version = metadata_version;
atomic_set(&cmd->ref_count, 1);
init_rwsem(&cmd->root_lock);
cmd->bdev = bdev;
@@ -757,7 +801,8 @@ static struct dm_cache_metadata *lookup(struct block_device *bdev)
static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev,
sector_t data_block_size,
bool may_format_device,
- size_t policy_hint_size)
+ size_t policy_hint_size,
+ unsigned metadata_version)
{
struct dm_cache_metadata *cmd, *cmd2;
@@ -768,7 +813,8 @@ static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev,
if (cmd)
return cmd;
- cmd = metadata_open(bdev, data_block_size, may_format_device, policy_hint_size);
+ cmd = metadata_open(bdev, data_block_size, may_format_device,
+ policy_hint_size, metadata_version);
if (!IS_ERR(cmd)) {
mutex_lock(&table_lock);
cmd2 = lookup(bdev);
@@ -800,10 +846,11 @@ static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size)
struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
sector_t data_block_size,
bool may_format_device,
- size_t policy_hint_size)
+ size_t policy_hint_size,
+ unsigned metadata_version)
{
- struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size,
- may_format_device, policy_hint_size);
+ struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size, may_format_device,
+ policy_hint_size, metadata_version);
if (!IS_ERR(cmd) && !same_params(cmd, data_block_size)) {
dm_cache_metadata_close(cmd);
@@ -829,8 +876,8 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
/*
* Checks that the given cache block is either unmapped or clean.
*/
-static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
- bool *result)
+static int block_clean_combined_dirty(struct dm_cache_metadata *cmd, dm_cblock_t b,
+ bool *result)
{
int r;
__le64 value;
@@ -838,10 +885,8 @@ static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
unsigned flags;
r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value);
- if (r) {
- DMERR("block_unmapped_or_clean failed");
+ if (r)
return r;
- }
unpack_value(value, &ob, &flags);
*result = !((flags & M_VALID) && (flags & M_DIRTY));
@@ -849,17 +894,19 @@ static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
return 0;
}
-static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
- dm_cblock_t begin, dm_cblock_t end,
- bool *result)
+static int blocks_are_clean_combined_dirty(struct dm_cache_metadata *cmd,
+ dm_cblock_t begin, dm_cblock_t end,
+ bool *result)
{
int r;
*result = true;
while (begin != end) {
- r = block_unmapped_or_clean(cmd, begin, result);
- if (r)
+ r = block_clean_combined_dirty(cmd, begin, result);
+ if (r) {
+ DMERR("block_clean_combined_dirty failed");
return r;
+ }
if (!*result) {
DMERR("cache block %llu is dirty",
@@ -873,6 +920,69 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
return 0;
}
+/*
+ * Metadata format 2 check that no cache block in [begin, end) has its bit
+ * set in the separate dirty bitset.
+ *
+ * On success returns 0 with *result set to true if every block in the range
+ * is clean, or false as soon as a dirty block is found (which is logged).
+ * If a cursor operation fails its error code is returned; *result is still
+ * true in that case and must not be relied upon.
+ */
+static int blocks_are_clean_separate_dirty(struct dm_cache_metadata *cmd,
+					   dm_cblock_t begin, dm_cblock_t end,
+					   bool *result)
+{
+	int r;
+	bool dirty_flag;
+
+	*result = true;
+
+	/*
+	 * A cache with no blocks has nothing to walk; bail out before
+	 * opening the cursor, as __load_discards() does for an empty
+	 * discard bitset.
+	 */
+	if (from_cblock(cmd->cache_blocks) == 0)
+		return 0;
+
+	r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root,
+				   from_cblock(cmd->cache_blocks), &cmd->dirty_cursor);
+	if (r) {
+		DMERR("%s: dm_bitset_cursor_begin for dirty failed", __func__);
+		return r;
+	}
+
+	r = dm_bitset_cursor_skip(&cmd->dirty_cursor, from_cblock(begin));
+	if (r) {
+		DMERR("%s: dm_bitset_cursor_skip for dirty failed", __func__);
+		dm_bitset_cursor_end(&cmd->dirty_cursor);
+		return r;
+	}
+
+	while (begin != end) {
+		/*
+		 * We assume that unmapped blocks have their dirty bit
+		 * cleared.
+		 */
+		dirty_flag = dm_bitset_cursor_get_value(&cmd->dirty_cursor);
+		if (dirty_flag) {
+			DMERR("%s: cache block %llu is dirty", __func__,
+			      (unsigned long long) from_cblock(begin));
+			dm_bitset_cursor_end(&cmd->dirty_cursor);
+			*result = false;
+			return 0;
+		}
+
+		/* Stop before stepping the cursor beyond the last block. */
+		begin = to_cblock(from_cblock(begin) + 1);
+		if (begin == end)
+			break;
+
+		r = dm_bitset_cursor_next(&cmd->dirty_cursor);
+		if (r) {
+			DMERR("%s: dm_bitset_cursor_next for dirty failed", __func__);
+			dm_bitset_cursor_end(&cmd->dirty_cursor);
+			return r;
+		}
+	}
+
+	dm_bitset_cursor_end(&cmd->dirty_cursor);
+
+	return 0;
+}
+
+/*
+ * Checks that every cache block in [begin, end) is clean, using whichever
+ * helper matches how this metadata stores its dirty bits.
+ */
+static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
+					dm_cblock_t begin, dm_cblock_t end,
+					bool *result)
+{
+	if (!separate_dirty_bits(cmd))
+		return blocks_are_clean_combined_dirty(cmd, begin, end, result);
+
+	return blocks_are_clean_separate_dirty(cmd, begin, end, result);
+}
+
static bool cmd_write_lock(struct dm_cache_metadata *cmd)
{
down_write(&cmd->root_lock);
@@ -950,8 +1060,18 @@ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
from_cblock(new_cache_size),
&null_mapping, &cmd->root);
- if (!r)
- cmd->cache_blocks = new_cache_size;
+ if (r)
+ goto out;
+
+ if (separate_dirty_bits(cmd)) {
+ r = dm_bitset_resize(&cmd->dirty_info, cmd->dirty_root,
+ from_cblock(cmd->cache_blocks), from_cblock(new_cache_size),
+ false, &cmd->dirty_root);
+ if (r)
+ goto out;
+ }
+
+ cmd->cache_blocks = new_cache_size;
cmd->changed = true;
out:
@@ -995,14 +1115,6 @@ static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
from_dblock(b), &cmd->discard_root);
}
-static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
- bool *is_discarded)
-{
- return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
- from_dblock(b), &cmd->discard_root,
- is_discarded);
-}
-
static int __discard(struct dm_cache_metadata *cmd,
dm_dblock_t dblock, bool discard)
{
@@ -1032,22 +1144,38 @@ static int __load_discards(struct dm_cache_metadata *cmd,
load_discard_fn fn, void *context)
{
int r = 0;
- dm_block_t b;
- bool discard;
+ uint32_t b;
+ struct dm_bitset_cursor c;
- for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
- dm_dblock_t dblock = to_dblock(b);
+ if (from_dblock(cmd->discard_nr_blocks) == 0)
+ /* nothing to do */
+ return 0;
- if (cmd->clean_when_opened) {
- r = __is_discarded(cmd, dblock, &discard);
- if (r)
- return r;
- } else
- discard = false;
+ if (cmd->clean_when_opened) {
+ r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, &cmd->discard_root);
+ if (r)
+ return r;
- r = fn(context, cmd->discard_block_size, dblock, discard);
+ r = dm_bitset_cursor_begin(&cmd->discard_info, cmd->discard_root,
+ from_dblock(cmd->discard_nr_blocks), &c);
if (r)
- break;
+ return r;
+
+		/*
+		 * Walk the discard bitset one bit per dblock.  The cursor has
+		 * to be advanced explicitly; without that every block would
+		 * be reported with the discard state of block 0.
+		 */
+		for (b = 0; ; b++) {
+			r = fn(context, cmd->discard_block_size, to_dblock(b),
+			       dm_bitset_cursor_get_value(&c));
+			if (r)
+				break;
+
+			/* Don't step the cursor past the final bit. */
+			if (b >= (from_dblock(cmd->discard_nr_blocks) - 1))
+				break;
+
+			r = dm_bitset_cursor_next(&c);
+			if (r)
+				break;
+		}
+
+ dm_bitset_cursor_end(&c);
+
+ } else {
+ for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
+ r = fn(context, cmd->discard_block_size, to_dblock(b), false);
+ if (r)
+ return r;
+ }
}
return r;
@@ -1177,11 +1305,11 @@ static bool hints_array_available(struct dm_cache_metadata *cmd,
hints_array_initialized(cmd);
}
-static int __load_mapping(struct dm_cache_metadata *cmd,
- uint64_t cb, bool hints_valid,
- struct dm_array_cursor *mapping_cursor,
- struct dm_array_cursor *hint_cursor,
- load_mapping_fn fn, void *context)
+static int __load_mapping_v1(struct dm_cache_metadata *cmd,
+ uint64_t cb, bool hints_valid,
+ struct dm_array_cursor *mapping_cursor,
+ struct dm_array_cursor *hint_cursor,
+ load_mapping_fn fn, void *context)
{
int r = 0;
@@ -1206,8 +1334,51 @@ static int __load_mapping(struct dm_cache_metadata *cmd,
r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY,
le32_to_cpu(hint), hints_valid);
- if (r)
- DMERR("policy couldn't load cblock");
+ if (r) {
+ DMERR("policy couldn't load cache block %llu",
+ (unsigned long long) from_cblock(to_cblock(cb)));
+ }
+ }
+
+ return r;
+}
+
+static int __load_mapping_v2(struct dm_cache_metadata *cmd,
+ uint64_t cb, bool hints_valid,
+ struct dm_array_cursor *mapping_cursor,
+ struct dm_array_cursor *hint_cursor,
+ struct dm_bitset_cursor *dirty_cursor,
+ load_mapping_fn fn, void *context)
+{
+ int r = 0;
+
+ __le64 mapping;
+ __le32 hint = 0;
+
+ __le64 *mapping_value_le;
+ __le32 *hint_value_le;
+
+ dm_oblock_t oblock;
+ unsigned flags;
+ bool dirty;
+
+ dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le);
+ memcpy(&mapping, mapping_value_le, sizeof(mapping));
+ unpack_value(mapping, &oblock, &flags);
+
+ if (flags & M_VALID) {
+ if (hints_valid) {
+ dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le);
+ memcpy(&hint, hint_value_le, sizeof(hint));
+ }
+
+ dirty = dm_bitset_cursor_get_value(dirty_cursor);
+ r = fn(context, oblock, to_cblock(cb), dirty,
+ le32_to_cpu(hint), hints_valid);
+ if (r) {
+ DMERR("policy couldn't load cache block %llu",
+ (unsigned long long) from_cblock(to_cblock(cb)));
+ }
}
return r;
@@ -1238,10 +1409,28 @@ static int __load_mappings(struct dm_cache_metadata *cmd,
}
}
+	if (separate_dirty_bits(cmd)) {
+		r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root,
+					   from_cblock(cmd->cache_blocks),
+					   &cmd->dirty_cursor);
+		if (r) {
+			/*
+			 * Unwind the same way the out: label does: the hint
+			 * cursor is only ended when hints are being loaded.
+			 */
+			if (hints_valid)
+				dm_array_cursor_end(&cmd->hint_cursor);
+			dm_array_cursor_end(&cmd->mapping_cursor);
+			return r;
+		}
+	}
+
for (cb = 0; ; cb++) {
- r = __load_mapping(cmd, cb, hints_valid,
- &cmd->mapping_cursor, &cmd->hint_cursor,
- fn, context);
+ if (separate_dirty_bits(cmd))
+ r = __load_mapping_v2(cmd, cb, hints_valid,
+ &cmd->mapping_cursor,
+ &cmd->hint_cursor,
+ &cmd->dirty_cursor,
+ fn, context);
+ else
+ r = __load_mapping_v1(cmd, cb, hints_valid,
+ &cmd->mapping_cursor, &cmd->hint_cursor,
+ fn, context);
if (r)
goto out;
@@ -1264,12 +1453,23 @@ static int __load_mappings(struct dm_cache_metadata *cmd,
goto out;
}
}
+
+ if (separate_dirty_bits(cmd)) {
+ r = dm_bitset_cursor_next(&cmd->dirty_cursor);
+ if (r) {
+ DMERR("dm_bitset_cursor_next for dirty failed");
+ goto out;
+ }
+ }
}
out:
dm_array_cursor_end(&cmd->mapping_cursor);
if (hints_valid)
dm_array_cursor_end(&cmd->hint_cursor);
+ if (separate_dirty_bits(cmd))
+ dm_bitset_cursor_end(&cmd->dirty_cursor);
+
return r;
}
@@ -1352,13 +1552,55 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty
}
-int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
- dm_cblock_t cblock, bool dirty)
+/*
+ * Format 1 path: pushes each of the first @nr_bits bits of @bits through
+ * __dirty() for the cache block with the same index, stopping at the first
+ * failure.
+ *
+ * Returns 0 on success or the first non-zero value returned by __dirty().
+ */
+static int __set_dirty_bits_v1(struct dm_cache_metadata *cmd, unsigned nr_bits, unsigned long *bits)
+{
+	unsigned cb;
+	int r = 0;
+
+	for (cb = 0; cb < nr_bits && !r; cb++)
+		r = __dirty(cmd, to_cblock(cb), test_bit(cb, bits));
+
+	return r;
+}
+
+/*
+ * Value callback handed to dm_bitset_new(): reports bit @index of the
+ * in-core bitmap passed as @context.  Always returns 0.
+ */
+static int is_dirty_callback(uint32_t index, bool *value, void *context)
+{
+	unsigned long *dirty_bitmap = context;
+
+	if (test_bit(index, dirty_bitmap))
+		*value = true;
+	else
+		*value = false;
+
+	return 0;
+}
+
+/*
+ * Format 2 path: replaces the on-disk dirty bitset wholesale with the
+ * contents of @bits.
+ *
+ * @nr_bits must equal the number of cache blocks, otherwise -EINVAL is
+ * returned.  The existing bitset is deleted and a new one is built with
+ * is_dirty_callback() supplying each bit.
+ *
+ * NOTE(review): the old bitset is deleted before the new one is created, so
+ * it looks as if a failure in dm_bitset_new() could leave dirty_root
+ * referring to the deleted bitset -- confirm against dm_bitset_new().
+ */
+static int __set_dirty_bits_v2(struct dm_cache_metadata *cmd, unsigned nr_bits, unsigned long *bits)
+{
+	int err;
+
+	/* nr_bits is really just a sanity check */
+	if (from_cblock(cmd->cache_blocks) != nr_bits) {
+		DMERR("dirty bitset is wrong size");
+		return -EINVAL;
+	}
+
+	err = dm_bitset_del(&cmd->dirty_info, cmd->dirty_root);
+	if (err)
+		return err;
+
+	cmd->changed = true;
+	err = dm_bitset_new(&cmd->dirty_info, &cmd->dirty_root, nr_bits,
+			    is_dirty_callback, bits);
+
+	return err;
+}
+
+int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd,
+ unsigned nr_bits,
+ unsigned long *bits)
{
int r;
WRITE_LOCK(cmd);
- r = __dirty(cmd, cblock, dirty);
+ if (separate_dirty_bits(cmd))
+ r = __set_dirty_bits_v2(cmd, nr_bits, bits);
+ else
+ r = __set_dirty_bits_v1(cmd, nr_bits, bits);
WRITE_UNLOCK(cmd);
return r;
@@ -1382,17 +1624,19 @@ void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
{
- int r;
+ int r = -EINVAL;
flags_mutator mutator = (clean_shutdown ? set_clean_shutdown :
clear_clean_shutdown);
WRITE_LOCK(cmd);
+ if (cmd->fail_io)
+ goto out;
+
r = __commit_transaction(cmd, mutator);
if (r)
goto out;
r = __begin_transaction(cmd);
-
out:
WRITE_UNLOCK(cmd);
return r;
@@ -1404,7 +1648,8 @@ int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
int r = -EINVAL;
READ_LOCK(cmd);
- r = dm_sm_get_nr_free(cmd->metadata_sm, result);
+ if (!cmd->fail_io)
+ r = dm_sm_get_nr_free(cmd->metadata_sm, result);
READ_UNLOCK(cmd);
return r;
@@ -1416,7 +1661,8 @@ int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
int r = -EINVAL;
READ_LOCK(cmd);
- r = dm_sm_get_nr_blocks(cmd->metadata_sm, result);
+ if (!cmd->fail_io)
+ r = dm_sm_get_nr_blocks(cmd->metadata_sm, result);
READ_UNLOCK(cmd);
return r;
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 8528744195e5..179ed5bf81a3 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -45,18 +45,22 @@
* As these various flags are defined they should be added to the
* following masks.
*/
+
#define DM_CACHE_FEATURE_COMPAT_SUPP 0UL
#define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL
#define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL
+struct dm_cache_metadata;
+
/*
- * Reopens or creates a new, empty metadata volume.
- * Returns an ERR_PTR on failure.
+ * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on
+ * failure. If reopening then features must match.
*/
struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
sector_t data_block_size,
bool may_format_device,
- size_t policy_hint_size);
+ size_t policy_hint_size,
+ unsigned metadata_version);
void dm_cache_metadata_close(struct dm_cache_metadata *cmd);
@@ -91,7 +95,8 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
load_mapping_fn fn,
void *context);
-int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty);
+int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd,
+ unsigned nr_bits, unsigned long *bits);
struct dm_cache_statistics {
uint32_t read_hits;
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
deleted file mode 100644
index 2e8a8f1d8358..000000000000
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ /dev/null
@@ -1,469 +0,0 @@
-/*
- * Copyright (C) 2012 Red Hat. All rights reserved.
- *
- * writeback cache policy supporting flushing out dirty cache blocks.
- *
- * This file is released under the GPL.
- */
-
-#include "dm-cache-policy.h"
-#include "dm.h"
-
-#include <linux/hash.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-/*----------------------------------------------------------------*/
-
-#define DM_MSG_PREFIX "cache cleaner"
-
-/* Cache entry struct. */
-struct wb_cache_entry {
- struct list_head list;
- struct hlist_node hlist;
-
- dm_oblock_t oblock;
- dm_cblock_t cblock;
- bool dirty:1;
- bool pending:1;
-};
-
-struct hash {
- struct hlist_head *table;
- dm_block_t hash_bits;
- unsigned nr_buckets;
-};
-
-struct policy {
- struct dm_cache_policy policy;
- spinlock_t lock;
-
- struct list_head free;
- struct list_head clean;
- struct list_head clean_pending;
- struct list_head dirty;
-
- /*
- * We know exactly how many cblocks will be needed,
- * so we can allocate them up front.
- */
- dm_cblock_t cache_size, nr_cblocks_allocated;
- struct wb_cache_entry *cblocks;
- struct hash chash;
-};
-
-/*----------------------------------------------------------------------------*/
-
-/*
- * Low-level functions.
- */
-static unsigned next_power(unsigned n, unsigned min)
-{
- return roundup_pow_of_two(max(n, min));
-}
-
-static struct policy *to_policy(struct dm_cache_policy *p)
-{
- return container_of(p, struct policy, policy);
-}
-
-static struct list_head *list_pop(struct list_head *q)
-{
- struct list_head *r = q->next;
-
- list_del(r);
-
- return r;
-}
-
-/*----------------------------------------------------------------------------*/
-
-/* Allocate/free various resources. */
-static int alloc_hash(struct hash *hash, unsigned elts)
-{
- hash->nr_buckets = next_power(elts >> 4, 16);
- hash->hash_bits = __ffs(hash->nr_buckets);
- hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets);
-
- return hash->table ? 0 : -ENOMEM;
-}
-
-static void free_hash(struct hash *hash)
-{
- vfree(hash->table);
-}
-
-static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size)
-{
- int r = -ENOMEM;
-
- p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size));
- if (p->cblocks) {
- unsigned u = from_cblock(cache_size);
-
- while (u--)
- list_add(&p->cblocks[u].list, &p->free);
-
- p->nr_cblocks_allocated = 0;
-
- /* Cache entries hash. */
- r = alloc_hash(&p->chash, from_cblock(cache_size));
- if (r)
- vfree(p->cblocks);
- }
-
- return r;
-}
-
-static void free_cache_blocks_and_hash(struct policy *p)
-{
- free_hash(&p->chash);
- vfree(p->cblocks);
-}
-
-static struct wb_cache_entry *alloc_cache_entry(struct policy *p)
-{
- struct wb_cache_entry *e;
-
- BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size));
-
- e = list_entry(list_pop(&p->free), struct wb_cache_entry, list);
- p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1);
-
- return e;
-}
-
-/*----------------------------------------------------------------------------*/
-
-/* Hash functions (lookup, insert, remove). */
-static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock)
-{
- struct hash *hash = &p->chash;
- unsigned h = hash_64(from_oblock(oblock), hash->hash_bits);
- struct wb_cache_entry *cur;
- struct hlist_head *bucket = &hash->table[h];
-
- hlist_for_each_entry(cur, bucket, hlist) {
- if (cur->oblock == oblock) {
- /* Move upfront bucket for faster access. */
- hlist_del(&cur->hlist);
- hlist_add_head(&cur->hlist, bucket);
- return cur;
- }
- }
-
- return NULL;
-}
-
-static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e)
-{
- unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits);
-
- hlist_add_head(&e->hlist, &p->chash.table[h]);
-}
-
-static void remove_cache_hash_entry(struct wb_cache_entry *e)
-{
- hlist_del(&e->hlist);
-}
-
-/* Public interface (see dm-cache-policy.h */
-static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
- bool can_block, bool can_migrate, bool discarded_oblock,
- struct bio *bio, struct policy_locker *locker,
- struct policy_result *result)
-{
- struct policy *p = to_policy(pe);
- struct wb_cache_entry *e;
- unsigned long flags;
-
- result->op = POLICY_MISS;
-
- if (can_block)
- spin_lock_irqsave(&p->lock, flags);
-
- else if (!spin_trylock_irqsave(&p->lock, flags))
- return -EWOULDBLOCK;
-
- e = lookup_cache_entry(p, oblock);
- if (e) {
- result->op = POLICY_HIT;
- result->cblock = e->cblock;
-
- }
-
- spin_unlock_irqrestore(&p->lock, flags);
-
- return 0;
-}
-
-static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock)
-{
- int r;
- struct policy *p = to_policy(pe);
- struct wb_cache_entry *e;
- unsigned long flags;
-
- if (!spin_trylock_irqsave(&p->lock, flags))
- return -EWOULDBLOCK;
-
- e = lookup_cache_entry(p, oblock);
- if (e) {
- *cblock = e->cblock;
- r = 0;
-
- } else
- r = -ENOENT;
-
- spin_unlock_irqrestore(&p->lock, flags);
-
- return r;
-}
-
-static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set)
-{
- struct policy *p = to_policy(pe);
- struct wb_cache_entry *e;
-
- e = lookup_cache_entry(p, oblock);
- BUG_ON(!e);
-
- if (set) {
- if (!e->dirty) {
- e->dirty = true;
- list_move(&e->list, &p->dirty);
- }
-
- } else {
- if (e->dirty) {
- e->pending = false;
- e->dirty = false;
- list_move(&e->list, &p->clean);
- }
- }
-}
-
-static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
-{
- struct policy *p = to_policy(pe);
- unsigned long flags;
-
- spin_lock_irqsave(&p->lock, flags);
- __set_clear_dirty(pe, oblock, true);
- spin_unlock_irqrestore(&p->lock, flags);
-}
-
-static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
-{
- struct policy *p = to_policy(pe);
- unsigned long flags;
-
- spin_lock_irqsave(&p->lock, flags);
- __set_clear_dirty(pe, oblock, false);
- spin_unlock_irqrestore(&p->lock, flags);
-}
-
-static void add_cache_entry(struct policy *p, struct wb_cache_entry *e)
-{
- insert_cache_hash_entry(p, e);
- if (e->dirty)
- list_add(&e->list, &p->dirty);
- else
- list_add(&e->list, &p->clean);
-}
-
-static int wb_load_mapping(struct dm_cache_policy *pe,
- dm_oblock_t oblock, dm_cblock_t cblock,
- uint32_t hint, bool hint_valid)
-{
- int r;
- struct policy *p = to_policy(pe);
- struct wb_cache_entry *e = alloc_cache_entry(p);
-
- if (e) {
- e->cblock = cblock;
- e->oblock = oblock;
- e->dirty = false; /* blocks default to clean */
- add_cache_entry(p, e);
- r = 0;
-
- } else
- r = -ENOMEM;
-
- return r;
-}
-
-static void wb_destroy(struct dm_cache_policy *pe)
-{
- struct policy *p = to_policy(pe);
-
- free_cache_blocks_and_hash(p);
- kfree(p);
-}
-
-static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock)
-{
- struct wb_cache_entry *r = lookup_cache_entry(p, oblock);
-
- BUG_ON(!r);
-
- remove_cache_hash_entry(r);
- list_del(&r->list);
-
- return r;
-}
-
-static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock)
-{
- struct policy *p = to_policy(pe);
- struct wb_cache_entry *e;
- unsigned long flags;
-
- spin_lock_irqsave(&p->lock, flags);
- e = __wb_force_remove_mapping(p, oblock);
- list_add_tail(&e->list, &p->free);
- BUG_ON(!from_cblock(p->nr_cblocks_allocated));
- p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1);
- spin_unlock_irqrestore(&p->lock, flags);
-}
-
-static void wb_force_mapping(struct dm_cache_policy *pe,
- dm_oblock_t current_oblock, dm_oblock_t oblock)
-{
- struct policy *p = to_policy(pe);
- struct wb_cache_entry *e;
- unsigned long flags;
-
- spin_lock_irqsave(&p->lock, flags);
- e = __wb_force_remove_mapping(p, current_oblock);
- e->oblock = oblock;
- add_cache_entry(p, e);
- spin_unlock_irqrestore(&p->lock, flags);
-}
-
-static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
-{
- struct list_head *l;
- struct wb_cache_entry *r;
-
- if (list_empty(&p->dirty))
- return NULL;
-
- l = list_pop(&p->dirty);
- r = container_of(l, struct wb_cache_entry, list);
- list_add(l, &p->clean_pending);
-
- return r;
-}
-
-static int wb_writeback_work(struct dm_cache_policy *pe,
- dm_oblock_t *oblock,
- dm_cblock_t *cblock,
- bool critical_only)
-{
- int r = -ENOENT;
- struct policy *p = to_policy(pe);
- struct wb_cache_entry *e;
- unsigned long flags;
-
- spin_lock_irqsave(&p->lock, flags);
-
- e = get_next_dirty_entry(p);
- if (e) {
- *oblock = e->oblock;
- *cblock = e->cblock;
- r = 0;
- }
-
- spin_unlock_irqrestore(&p->lock, flags);
-
- return r;
-}
-
-static dm_cblock_t wb_residency(struct dm_cache_policy *pe)
-{
- return to_policy(pe)->nr_cblocks_allocated;
-}
-
-/* Init the policy plugin interface function pointers. */
-static void init_policy_functions(struct policy *p)
-{
- p->policy.destroy = wb_destroy;
- p->policy.map = wb_map;
- p->policy.lookup = wb_lookup;
- p->policy.set_dirty = wb_set_dirty;
- p->policy.clear_dirty = wb_clear_dirty;
- p->policy.load_mapping = wb_load_mapping;
- p->policy.get_hint = NULL;
- p->policy.remove_mapping = wb_remove_mapping;
- p->policy.writeback_work = wb_writeback_work;
- p->policy.force_mapping = wb_force_mapping;
- p->policy.residency = wb_residency;
- p->policy.tick = NULL;
-}
-
-static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
- sector_t origin_size,
- sector_t cache_block_size)
-{
- int r;
- struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL);
-
- if (!p)
- return NULL;
-
- init_policy_functions(p);
- INIT_LIST_HEAD(&p->free);
- INIT_LIST_HEAD(&p->clean);
- INIT_LIST_HEAD(&p->clean_pending);
- INIT_LIST_HEAD(&p->dirty);
-
- p->cache_size = cache_size;
- spin_lock_init(&p->lock);
-
- /* Allocate cache entry structs and add them to free list. */
- r = alloc_cache_blocks_with_hash(p, cache_size);
- if (!r)
- return &p->policy;
-
- kfree(p);
-
- return NULL;
-}
-/*----------------------------------------------------------------------------*/
-
-static struct dm_cache_policy_type wb_policy_type = {
- .name = "cleaner",
- .version = {1, 0, 0},
- .hint_size = 4,
- .owner = THIS_MODULE,
- .create = wb_create
-};
-
-static int __init wb_init(void)
-{
- int r = dm_cache_policy_register(&wb_policy_type);
-
- if (r < 0)
- DMERR("register failed %d", r);
- else
- DMINFO("version %u.%u.%u loaded",
- wb_policy_type.version[0],
- wb_policy_type.version[1],
- wb_policy_type.version[2]);
-
- return r;
-}
-
-static void __exit wb_exit(void)
-{
- dm_cache_policy_unregister(&wb_policy_type);
-}
-
-module_init(wb_init);
-module_exit(wb_exit);
-
-MODULE_AUTHOR("Heinz Mauelshagen <[email protected]>");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("cleaner cache policy");
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 808ee0e2b2c4..56f0a23f698c 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -12,70 +12,65 @@
/*----------------------------------------------------------------*/
-/*
- * Little inline functions that simplify calling the policy methods.
- */
-static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
- bool can_block, bool can_migrate, bool discarded_oblock,
- struct bio *bio, struct policy_locker *locker,
- struct policy_result *result)
+static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
+ int data_dir, bool fast_copy, bool *background_queued)
{
- return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result);
+ return p->lookup(p, oblock, cblock, data_dir, fast_copy, background_queued);
}
-static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+static inline int policy_lookup_with_work(struct dm_cache_policy *p,
+ dm_oblock_t oblock, dm_cblock_t *cblock,
+ int data_dir, bool fast_copy,
+ struct policy_work **work)
{
- BUG_ON(!p->lookup);
- return p->lookup(p, oblock, cblock);
-}
+ if (!p->lookup_with_work) {
+ *work = NULL;
+ return p->lookup(p, oblock, cblock, data_dir, fast_copy, NULL);
+ }
-static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
-{
- if (p->set_dirty)
- p->set_dirty(p, oblock);
+ return p->lookup_with_work(p, oblock, cblock, data_dir, fast_copy, work);
}
-static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+static inline int policy_get_background_work(struct dm_cache_policy *p,
+ bool idle, struct policy_work **result)
{
- if (p->clear_dirty)
- p->clear_dirty(p, oblock);
+ return p->get_background_work(p, idle, result);
}
-static inline int policy_load_mapping(struct dm_cache_policy *p,
- dm_oblock_t oblock, dm_cblock_t cblock,
- uint32_t hint, bool hint_valid)
+static inline void policy_complete_background_work(struct dm_cache_policy *p,
+ struct policy_work *work,
+ bool success)
{
- return p->load_mapping(p, oblock, cblock, hint, hint_valid);
+ return p->complete_background_work(p, work, success);
}
-static inline uint32_t policy_get_hint(struct dm_cache_policy *p,
- dm_cblock_t cblock)
+static inline void policy_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
- return p->get_hint ? p->get_hint(p, cblock) : 0;
+ p->set_dirty(p, cblock);
}
-static inline int policy_writeback_work(struct dm_cache_policy *p,
- dm_oblock_t *oblock,
- dm_cblock_t *cblock,
- bool critical_only)
+static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
- return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT;
+ p->clear_dirty(p, cblock);
}
-static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+static inline int policy_load_mapping(struct dm_cache_policy *p,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ bool dirty, uint32_t hint, bool hint_valid)
{
- p->remove_mapping(p, oblock);
+ return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid);
}
-static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+static inline int policy_invalidate_mapping(struct dm_cache_policy *p,
+ dm_cblock_t cblock)
{
- return p->remove_cblock(p, cblock);
+ return p->invalidate_mapping(p, cblock);
}
-static inline void policy_force_mapping(struct dm_cache_policy *p,
- dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+static inline uint32_t policy_get_hint(struct dm_cache_policy *p,
+ dm_cblock_t cblock)
{
- return p->force_mapping(p, current_oblock, new_oblock);
+ return p->get_hint ? p->get_hint(p, cblock) : 0;
}
static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
@@ -107,6 +102,11 @@ static inline int policy_set_config_value(struct dm_cache_policy *p,
return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL;
}
+static inline void policy_allow_migrations(struct dm_cache_policy *p, bool allow)
+{
+ return p->allow_migrations(p, allow);
+}
+
/*----------------------------------------------------------------*/
/*
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index f19c6930a67c..e5eb9c9b4bc8 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -4,8 +4,9 @@
* This file is released under the GPL.
*/
-#include "dm-cache-policy.h"
+#include "dm-cache-background-tracker.h"
#include "dm-cache-policy-internal.h"
+#include "dm-cache-policy.h"
#include "dm.h"
#include <linux/hash.h>
@@ -38,10 +39,11 @@ struct entry {
unsigned hash_next:28;
unsigned prev:28;
unsigned next:28;
- unsigned level:7;
+ unsigned level:6;
bool dirty:1;
bool allocated:1;
bool sentinel:1;
+ bool pending_work:1;
dm_oblock_t oblock;
};
@@ -279,14 +281,28 @@ static unsigned q_size(struct queue *q)
*/
static void q_push(struct queue *q, struct entry *e)
{
+ BUG_ON(e->pending_work);
+
if (!e->sentinel)
q->nr_elts++;
l_add_tail(q->es, q->qs + e->level, e);
}
+static void q_push_front(struct queue *q, struct entry *e)
+{
+ BUG_ON(e->pending_work);
+
+ if (!e->sentinel)
+ q->nr_elts++;
+
+ l_add_head(q->es, q->qs + e->level, e);
+}
+
static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
{
+ BUG_ON(e->pending_work);
+
if (!e->sentinel)
q->nr_elts++;
@@ -336,19 +352,6 @@ static struct entry *q_pop(struct queue *q)
}
/*
- * Pops an entry from a level that is not past a sentinel.
- */
-static struct entry *q_pop_old(struct queue *q, unsigned max_level)
-{
- struct entry *e = q_peek(q, max_level, false);
-
- if (e)
- q_del(q, e);
-
- return e;
-}
-
-/*
* This function assumes there is a non-sentinel entry to pop. It's only
* used by redistribute, so we know this is true. It also doesn't adjust
* the q->nr_elts count.
@@ -446,45 +449,49 @@ static void q_redistribute(struct queue *q)
break;
e->level = level + 1u;
- l_add_head(q->es, l_above, e);
+ l_add_tail(q->es, l_above, e);
}
}
}
-static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels)
+static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels,
+ struct entry *s1, struct entry *s2)
{
struct entry *de;
- unsigned new_level;
-
- q_del(q, e);
+ unsigned sentinels_passed = 0;
+ unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels);
+ /* try and find an entry to swap with */
if (extra_levels && (e->level < q->nr_levels - 1u)) {
- new_level = min(q->nr_levels - 1u, e->level + extra_levels);
- for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) {
- if (de->sentinel)
- continue;
+ for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
+ sentinels_passed++;
+ if (de) {
q_del(q, de);
de->level = e->level;
+ if (s1) {
+ switch (sentinels_passed) {
+ case 0:
+ q_push_before(q, s1, de);
+ break;
+
+ case 1:
+ q_push_before(q, s2, de);
+ break;
- if (dest)
- q_push_before(q, dest, de);
- else
+ default:
+ q_push(q, de);
+ }
+ } else
q_push(q, de);
- break;
}
-
- e->level = new_level;
}
+ q_del(q, e);
+ e->level = new_level;
q_push(q, e);
}
-static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
-{
- q_requeue_before(q, NULL, e, extra_levels);
-}
-
/*----------------------------------------------------------------*/
#define FP_SHIFT 8
@@ -550,7 +557,7 @@ static enum performance stats_assess(struct stats *s)
/*----------------------------------------------------------------*/
-struct hash_table {
+struct smq_hash_table {
struct entry_space *es;
unsigned long long hash_bits;
unsigned *buckets;
@@ -560,7 +567,7 @@ struct hash_table {
* All cache entries are stored in a chained hash table. To save space we
* use indexing again, and only store indexes to the next entry.
*/
-static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries)
+static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries)
{
unsigned i, nr_buckets;
@@ -578,34 +585,34 @@ static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_ent
return 0;
}
-static void h_exit(struct hash_table *ht)
+static void h_exit(struct smq_hash_table *ht)
{
vfree(ht->buckets);
}
-static struct entry *h_head(struct hash_table *ht, unsigned bucket)
+static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket)
{
return to_entry(ht->es, ht->buckets[bucket]);
}
-static struct entry *h_next(struct hash_table *ht, struct entry *e)
+static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
{
return to_entry(ht->es, e->hash_next);
}
-static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e)
+static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e)
{
e->hash_next = ht->buckets[bucket];
ht->buckets[bucket] = to_index(ht->es, e);
}
-static void h_insert(struct hash_table *ht, struct entry *e)
+static void h_insert(struct smq_hash_table *ht, struct entry *e)
{
unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
__h_insert(ht, h, e);
}
-static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock,
+static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock,
struct entry **prev)
{
struct entry *e;
@@ -621,7 +628,7 @@ static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t o
return NULL;
}
-static void __h_unlink(struct hash_table *ht, unsigned h,
+static void __h_unlink(struct smq_hash_table *ht, unsigned h,
struct entry *e, struct entry *prev)
{
if (prev)
@@ -633,7 +640,7 @@ static void __h_unlink(struct hash_table *ht, unsigned h,
/*
* Also moves each entry to the front of the bucket.
*/
-static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
+static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
{
struct entry *e, *prev;
unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
@@ -651,7 +658,7 @@ static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
return e;
}
-static void h_remove(struct hash_table *ht, struct entry *e)
+static void h_remove(struct smq_hash_table *ht, struct entry *e)
{
unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
struct entry *prev;
@@ -699,7 +706,10 @@ static void init_entry(struct entry *e)
e->next = INDEXER_NULL;
e->prev = INDEXER_NULL;
e->level = 0u;
+ e->dirty = true; /* FIXME: audit */
e->allocated = true;
+ e->sentinel = false;
+ e->pending_work = false;
}
static struct entry *alloc_entry(struct entry_alloc *ea)
@@ -762,11 +772,11 @@ static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
#define NR_HOTSPOT_LEVELS 64u
#define NR_CACHE_LEVELS 64u
-#define WRITEBACK_PERIOD (10 * HZ)
-#define DEMOTE_PERIOD (60 * HZ)
+#define WRITEBACK_PERIOD (10ul * HZ)
+#define DEMOTE_PERIOD (60ul * HZ)
#define HOTSPOT_UPDATE_PERIOD (HZ)
-#define CACHE_UPDATE_PERIOD (10u * HZ)
+#define CACHE_UPDATE_PERIOD (60ul * HZ)
struct smq_policy {
struct dm_cache_policy policy;
@@ -814,8 +824,8 @@ struct smq_policy {
* The hash tables allows us to quickly find an entry by origin
* block.
*/
- struct hash_table table;
- struct hash_table hotspot_table;
+ struct smq_hash_table table;
+ struct smq_hash_table hotspot_table;
bool current_writeback_sentinels;
unsigned long next_writeback_period;
@@ -828,6 +838,10 @@ struct smq_policy {
unsigned long next_hotspot_period;
unsigned long next_cache_period;
+
+ struct background_tracker *bg_work;
+
+ bool migrations_allowed;
};
/*----------------------------------------------------------------*/
@@ -876,15 +890,15 @@ static void __update_demote_sentinels(struct smq_policy *mq)
static void update_sentinels(struct smq_policy *mq)
{
if (time_after(jiffies, mq->next_writeback_period)) {
- __update_writeback_sentinels(mq);
mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+ __update_writeback_sentinels(mq);
}
if (time_after(jiffies, mq->next_demote_period)) {
- __update_demote_sentinels(mq);
mq->next_demote_period = jiffies + DEMOTE_PERIOD;
mq->current_demote_sentinels = !mq->current_demote_sentinels;
+ __update_demote_sentinels(mq);
}
}
@@ -920,55 +934,40 @@ static void sentinels_init(struct smq_policy *mq)
/*----------------------------------------------------------------*/
-/*
- * These methods tie together the dirty queue, clean queue and hash table.
- */
-static void push_new(struct smq_policy *mq, struct entry *e)
+static void del_queue(struct smq_policy *mq, struct entry *e)
{
- struct queue *q = e->dirty ? &mq->dirty : &mq->clean;
- h_insert(&mq->table, e);
- q_push(q, e);
+ q_del(e->dirty ? &mq->dirty : &mq->clean, e);
}
-static void push(struct smq_policy *mq, struct entry *e)
+static void push_queue(struct smq_policy *mq, struct entry *e)
{
- struct entry *sentinel;
-
- h_insert(&mq->table, e);
-
- /*
- * Punch this into the queue just in front of the sentinel, to
- * ensure it's cleaned straight away.
- */
- if (e->dirty) {
- sentinel = writeback_sentinel(mq, e->level);
- q_push_before(&mq->dirty, sentinel, e);
- } else {
- sentinel = demote_sentinel(mq, e->level);
- q_push_before(&mq->clean, sentinel, e);
- }
+ if (e->dirty)
+ q_push(&mq->dirty, e);
+ else
+ q_push(&mq->clean, e);
}
-/*
- * Removes an entry from cache. Removes from the hash table.
- */
-static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
+// !h, !q, a -> h, q, a
+static void push(struct smq_policy *mq, struct entry *e)
{
- q_del(q, e);
- h_remove(&mq->table, e);
+ h_insert(&mq->table, e);
+ if (!e->pending_work)
+ push_queue(mq, e);
}
-static void del(struct smq_policy *mq, struct entry *e)
+static void push_queue_front(struct smq_policy *mq, struct entry *e)
{
- __del(mq, e->dirty ? &mq->dirty : &mq->clean, e);
+ if (e->dirty)
+ q_push_front(&mq->dirty, e);
+ else
+ q_push_front(&mq->clean, e);
}
-static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level)
+static void push_front(struct smq_policy *mq, struct entry *e)
{
- struct entry *e = q_pop_old(q, max_level);
- if (e)
- h_remove(&mq->table, e);
- return e;
+ h_insert(&mq->table, e);
+ if (!e->pending_work)
+ push_queue_front(mq, e);
}
static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
@@ -978,16 +977,21 @@ static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
static void requeue(struct smq_policy *mq, struct entry *e)
{
- struct entry *sentinel;
+ /*
+ * Pending work has temporarily been taken out of the queues.
+ */
+ if (e->pending_work)
+ return;
if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
- if (e->dirty) {
- sentinel = writeback_sentinel(mq, e->level);
- q_requeue_before(&mq->dirty, sentinel, e, 1u);
- } else {
- sentinel = demote_sentinel(mq, e->level);
- q_requeue_before(&mq->clean, sentinel, e, 1u);
+ if (!e->dirty) {
+ q_requeue(&mq->clean, e, 1u, NULL, NULL);
+ return;
}
+
+ q_requeue(&mq->dirty, e, 1u,
+ get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
+ get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
}
}
@@ -1026,6 +1030,8 @@ static void update_promote_levels(struct smq_policy *mq)
unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
+ threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);
+
/*
* If the hotspot queue is performing badly then we have little
* confidence that we know which blocks to promote. So we cut down
@@ -1045,7 +1051,7 @@ static void update_promote_levels(struct smq_policy *mq)
}
mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
- mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u;
+ mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
}
/*
@@ -1095,34 +1101,144 @@ static void end_cache_period(struct smq_policy *mq)
}
}
-static int demote_cblock(struct smq_policy *mq,
- struct policy_locker *locker,
- dm_oblock_t *oblock)
+/*----------------------------------------------------------------*/
+
+/*
+ * Targets are given as a percentage.
+ */
+#define CLEAN_TARGET 25u
+#define FREE_TARGET 25u
+
+static unsigned percent_to_target(struct smq_policy *mq, unsigned p)
{
- struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false);
- if (!demoted)
+ return from_cblock(mq->cache_size) * p / 100u;
+}
+
+static bool clean_target_met(struct smq_policy *mq, bool idle)
+{
+ /*
+ * Cache entries may not be populated. So we cannot rely on the
+ * size of the clean queue.
+ */
+ if (idle) {
/*
- * We could get a block from mq->dirty, but that
- * would add extra latency to the triggering bio as it
- * waits for the writeback. Better to not promote this
- * time and hope there's a clean block next time this block
- * is hit.
+ * We'd like to clean everything.
*/
- return -ENOSPC;
+ return q_size(&mq->dirty) == 0u;
+ }
+
+ /*
+ * If we're busy we don't worry about cleaning at all.
+ */
+ return true;
+}
- if (locker->fn(locker, demoted->oblock))
+static bool free_target_met(struct smq_policy *mq)
+{
+ unsigned nr_free;
+
+ nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
+ return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
+ percent_to_target(mq, FREE_TARGET);
+}
+
+/*----------------------------------------------------------------*/
+
+static void mark_pending(struct smq_policy *mq, struct entry *e)
+{
+ BUG_ON(e->sentinel);
+ BUG_ON(!e->allocated);
+ BUG_ON(e->pending_work);
+ e->pending_work = true;
+}
+
+static void clear_pending(struct smq_policy *mq, struct entry *e)
+{
+ BUG_ON(!e->pending_work);
+ e->pending_work = false;
+}
+
+static void queue_writeback(struct smq_policy *mq)
+{
+ int r;
+ struct policy_work work;
+ struct entry *e;
+
+ e = q_peek(&mq->dirty, mq->dirty.nr_levels, !mq->migrations_allowed);
+ if (e) {
+ mark_pending(mq, e);
+ q_del(&mq->dirty, e);
+
+ work.op = POLICY_WRITEBACK;
+ work.oblock = e->oblock;
+ work.cblock = infer_cblock(mq, e);
+
+ r = btracker_queue(mq->bg_work, &work, NULL);
+ WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race.
+ }
+}
+
+static void queue_demotion(struct smq_policy *mq)
+{
+ struct policy_work work;
+ struct entry *e;
+
+ if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed)))
+ return;
+
+ e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
+ if (!e) {
+ if (!clean_target_met(mq, true))
+ queue_writeback(mq);
+ return;
+ }
+
+ mark_pending(mq, e);
+ q_del(&mq->clean, e);
+
+ work.op = POLICY_DEMOTE;
+ work.oblock = e->oblock;
+ work.cblock = infer_cblock(mq, e);
+ btracker_queue(mq->bg_work, &work, NULL);
+}
+
+static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
+ struct policy_work **workp)
+{
+ struct entry *e;
+ struct policy_work work;
+
+ if (!mq->migrations_allowed)
+ return;
+
+ if (allocator_empty(&mq->cache_alloc)) {
/*
- * We couldn't lock this block.
+ * We always claim to be 'idle' to ensure some demotions happen
+ * with continuous loads.
*/
- return -EBUSY;
+ if (!free_target_met(mq))
+ queue_demotion(mq);
+ return;
+ }
- del(mq, demoted);
- *oblock = demoted->oblock;
- free_entry(&mq->cache_alloc, demoted);
+ if (btracker_promotion_already_present(mq->bg_work, oblock))
+ return;
- return 0;
+ /*
+ * We allocate the entry now to reserve the cblock. If the
+ * background work is aborted we must remember to free it.
+ */
+ e = alloc_entry(&mq->cache_alloc);
+ BUG_ON(!e);
+ e->pending_work = true;
+ work.op = POLICY_PROMOTE;
+ work.oblock = oblock;
+ work.cblock = infer_cblock(mq, e);
+ btracker_queue(mq->bg_work, &work, workp);
}
+/*----------------------------------------------------------------*/
+
enum promote_result {
PROMOTE_NOT,
PROMOTE_TEMPORARY,
@@ -1137,49 +1253,18 @@ static enum promote_result maybe_promote(bool promote)
return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
}
-static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio,
- bool fast_promote)
+static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
+ int data_dir, bool fast_promote)
{
- if (bio_data_dir(bio) == WRITE) {
+ if (data_dir == WRITE) {
if (!allocator_empty(&mq->cache_alloc) && fast_promote)
return PROMOTE_TEMPORARY;
- else
- return maybe_promote(hs_e->level >= mq->write_promote_level);
+ return maybe_promote(hs_e->level >= mq->write_promote_level);
} else
return maybe_promote(hs_e->level >= mq->read_promote_level);
}
-static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
- struct policy_locker *locker,
- struct policy_result *result, enum promote_result pr)
-{
- int r;
- struct entry *e;
-
- if (allocator_empty(&mq->cache_alloc)) {
- result->op = POLICY_REPLACE;
- r = demote_cblock(mq, locker, &result->old_oblock);
- if (r) {
- result->op = POLICY_MISS;
- return;
- }
-
- } else
- result->op = POLICY_NEW;
-
- e = alloc_entry(&mq->cache_alloc);
- BUG_ON(!e);
- e->oblock = oblock;
-
- if (pr == PROMOTE_TEMPORARY)
- push(mq, e);
- else
- push_new(mq, e);
-
- result->cblock = infer_cblock(mq, e);
-}
-
static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
{
sector_t r = from_oblock(b);
@@ -1187,7 +1272,7 @@ static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
return to_oblock(r);
}
-static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio)
+static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
{
unsigned hi;
dm_oblock_t hb = to_hblock(mq, b);
@@ -1199,7 +1284,8 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b,
hi = get_index(&mq->hotspot_alloc, e);
q_requeue(&mq->hotspot, e,
test_and_set_bit(hi, mq->hotspot_hit_bits) ?
- 0u : mq->hotspot_level_jump);
+ 0u : mq->hotspot_level_jump,
+ NULL, NULL);
} else {
stats_miss(&mq->hotspot_stats);
@@ -1225,47 +1311,6 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b,
return e;
}
-/*
- * Looks the oblock up in the hash table, then decides whether to put in
- * pre_cache, or cache etc.
- */
-static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
- bool can_migrate, bool fast_promote,
- struct policy_locker *locker, struct policy_result *result)
-{
- struct entry *e, *hs_e;
- enum promote_result pr;
-
- hs_e = update_hotspot_queue(mq, oblock, bio);
-
- e = h_lookup(&mq->table, oblock);
- if (e) {
- stats_level_accessed(&mq->cache_stats, e->level);
-
- requeue(mq, e);
- result->op = POLICY_HIT;
- result->cblock = infer_cblock(mq, e);
-
- } else {
- stats_miss(&mq->cache_stats);
-
- pr = should_promote(mq, hs_e, bio, fast_promote);
- if (pr == PROMOTE_NOT)
- result->op = POLICY_MISS;
-
- else {
- if (!can_migrate) {
- result->op = POLICY_MISS;
- return -EWOULDBLOCK;
- }
-
- insert_in_cache(mq, oblock, locker, result, pr);
- }
- }
-
- return 0;
-}
-
/*----------------------------------------------------------------*/
/*
@@ -1282,6 +1327,7 @@ static void smq_destroy(struct dm_cache_policy *p)
{
struct smq_policy *mq = to_smq_policy(p);
+ btracker_destroy(mq->bg_work);
h_exit(&mq->hotspot_table);
h_exit(&mq->table);
free_bitset(mq->hotspot_hit_bits);
@@ -1290,234 +1336,244 @@ static void smq_destroy(struct dm_cache_policy *p)
kfree(mq);
}
-static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
- bool can_block, bool can_migrate, bool fast_promote,
- struct bio *bio, struct policy_locker *locker,
- struct policy_result *result)
-{
- int r;
- unsigned long flags;
- struct smq_policy *mq = to_smq_policy(p);
-
- result->op = POLICY_MISS;
-
- spin_lock_irqsave(&mq->lock, flags);
- r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
- spin_unlock_irqrestore(&mq->lock, flags);
-
- return r;
-}
+/*----------------------------------------------------------------*/
-static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
+ int data_dir, bool fast_copy,
+ struct policy_work **work, bool *background_work)
{
- int r;
- unsigned long flags;
- struct smq_policy *mq = to_smq_policy(p);
- struct entry *e;
+ struct entry *e, *hs_e;
+ enum promote_result pr;
+
+ *background_work = false;
- spin_lock_irqsave(&mq->lock, flags);
e = h_lookup(&mq->table, oblock);
if (e) {
+ stats_level_accessed(&mq->cache_stats, e->level);
+
+ requeue(mq, e);
*cblock = infer_cblock(mq, e);
- r = 0;
- } else
- r = -ENOENT;
- spin_unlock_irqrestore(&mq->lock, flags);
+ return 0;
- return r;
-}
+ } else {
+ stats_miss(&mq->cache_stats);
-static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set)
-{
- struct entry *e;
+ /*
+ * The hotspot queue only gets updated with misses.
+ */
+ hs_e = update_hotspot_queue(mq, oblock);
- e = h_lookup(&mq->table, oblock);
- BUG_ON(!e);
+ pr = should_promote(mq, hs_e, data_dir, fast_copy);
+ if (pr != PROMOTE_NOT) {
+ queue_promotion(mq, oblock, work);
+ *background_work = true;
+ }
- del(mq, e);
- e->dirty = set;
- push(mq, e);
+ return -ENOENT;
+ }
}
-static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
+ int data_dir, bool fast_copy,
+ bool *background_work)
{
+ int r;
unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
spin_lock_irqsave(&mq->lock, flags);
- __smq_set_clear_dirty(mq, oblock, true);
+ r = __lookup(mq, oblock, cblock,
+ data_dir, fast_copy,
+ NULL, background_work);
spin_unlock_irqrestore(&mq->lock, flags);
+
+ return r;
}
-static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+static int smq_lookup_with_work(struct dm_cache_policy *p,
+ dm_oblock_t oblock, dm_cblock_t *cblock,
+ int data_dir, bool fast_copy,
+ struct policy_work **work)
{
- struct smq_policy *mq = to_smq_policy(p);
+ int r;
+ bool background_queued;
unsigned long flags;
+ struct smq_policy *mq = to_smq_policy(p);
spin_lock_irqsave(&mq->lock, flags);
- __smq_set_clear_dirty(mq, oblock, false);
+ r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
spin_unlock_irqrestore(&mq->lock, flags);
-}
-static unsigned random_level(dm_cblock_t cblock)
-{
- return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
+ return r;
}
-static int smq_load_mapping(struct dm_cache_policy *p,
- dm_oblock_t oblock, dm_cblock_t cblock,
- uint32_t hint, bool hint_valid)
+static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
+ struct policy_work **result)
{
+ int r;
+ unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
- struct entry *e;
- e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
- e->oblock = oblock;
- e->dirty = false; /* this gets corrected in a minute */
- e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
- push(mq, e);
+ spin_lock_irqsave(&mq->lock, flags);
+ r = btracker_issue(mq->bg_work, result);
+ if (r == -ENODATA) {
+ if (!clean_target_met(mq, idle)) {
+ queue_writeback(mq);
+ r = btracker_issue(mq->bg_work, result);
+ }
+ }
+ spin_unlock_irqrestore(&mq->lock, flags);
- return 0;
+ return r;
}
-static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
-{
- struct smq_policy *mq = to_smq_policy(p);
- struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
-
- if (!e->allocated)
- return 0;
-
- return e->level;
-}
+/*
+ * We need to clear any pending work flags that have been set, and in the
+ * case of promotion free the entry for the destination cblock.
+ */
+static void __complete_background_work(struct smq_policy *mq,
+ struct policy_work *work,
+ bool success)
+{
+ struct entry *e = get_entry(&mq->cache_alloc,
+ from_cblock(work->cblock));
+
+ switch (work->op) {
+ case POLICY_PROMOTE:
+ // !h, !q, a
+ clear_pending(mq, e);
+ if (success) {
+ e->oblock = work->oblock;
+ e->level = NR_CACHE_LEVELS - 1;
+ push(mq, e);
+ // h, q, a
+ } else {
+ free_entry(&mq->cache_alloc, e);
+ // !h, !q, !a
+ }
+ break;
-static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
-{
- struct entry *e;
+ case POLICY_DEMOTE:
+ // h, !q, a
+ if (success) {
+ h_remove(&mq->table, e);
+ free_entry(&mq->cache_alloc, e);
+ // !h, !q, !a
+ } else {
+ clear_pending(mq, e);
+ push_queue(mq, e);
+ // h, q, a
+ }
+ break;
- e = h_lookup(&mq->table, oblock);
- BUG_ON(!e);
+ case POLICY_WRITEBACK:
+ // h, !q, a
+ clear_pending(mq, e);
+ push_queue(mq, e);
+ // h, q, a
+ break;
+ }
- del(mq, e);
- free_entry(&mq->cache_alloc, e);
+ btracker_complete(mq->bg_work, work);
}
-static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+static void smq_complete_background_work(struct dm_cache_policy *p,
+ struct policy_work *work,
+ bool success)
{
- struct smq_policy *mq = to_smq_policy(p);
unsigned long flags;
+ struct smq_policy *mq = to_smq_policy(p);
spin_lock_irqsave(&mq->lock, flags);
- __remove_mapping(mq, oblock);
+ __complete_background_work(mq, work, success);
spin_unlock_irqrestore(&mq->lock, flags);
}
-static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
+// in_hash(oblock) -> in_hash(oblock)
+static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
{
struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
- if (!e || !e->allocated)
- return -ENODATA;
-
- del(mq, e);
- free_entry(&mq->cache_alloc, e);
-
- return 0;
+ if (e->pending_work)
+ e->dirty = set;
+ else {
+ del_queue(mq, e);
+ e->dirty = set;
+ push_queue(mq, e);
+ }
}
-static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
- int r;
unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
spin_lock_irqsave(&mq->lock, flags);
- r = __remove_cblock(mq, cblock);
+ __smq_set_clear_dirty(mq, cblock, true);
spin_unlock_irqrestore(&mq->lock, flags);
-
- return r;
}
-
-#define CLEAN_TARGET_CRITICAL 5u /* percent */
-
-static bool clean_target_met(struct smq_policy *mq, bool critical)
+static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
- if (critical) {
- /*
- * Cache entries may not be populated. So we're cannot rely on the
- * size of the clean queue.
- */
- unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
- unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
+ struct smq_policy *mq = to_smq_policy(p);
+ unsigned long flags;
- return nr_clean >= target;
- } else
- return !q_size(&mq->dirty);
+ spin_lock_irqsave(&mq->lock, flags);
+ __smq_set_clear_dirty(mq, cblock, false);
+ spin_unlock_irqrestore(&mq->lock, flags);
}
-static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock,
- dm_cblock_t *cblock, bool critical_only)
+static unsigned random_level(dm_cblock_t cblock)
{
- struct entry *e = NULL;
- bool target_met = clean_target_met(mq, critical_only);
-
- if (critical_only)
- /*
- * Always try and keep the bottom level clean.
- */
- e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
+ return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
+}
- else
- e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels);
+static int smq_load_mapping(struct dm_cache_policy *p,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ bool dirty, uint32_t hint, bool hint_valid)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+ struct entry *e;
- if (!e)
- return -ENODATA;
+ e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
+ e->oblock = oblock;
+ e->dirty = dirty;
+ e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
+ e->pending_work = false;
- *oblock = e->oblock;
- *cblock = infer_cblock(mq, e);
- e->dirty = false;
- push_new(mq, e);
+ /*
+ * When we load mappings we push ahead of both sentinels in order to
+ * allow demotions and cleaning to occur immediately.
+ */
+ push_front(mq, e);
return 0;
}
-static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
- dm_cblock_t *cblock, bool critical_only)
+static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
{
- int r;
- unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
+ struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
- spin_lock_irqsave(&mq->lock, flags);
- r = __smq_writeback_work(mq, oblock, cblock, critical_only);
- spin_unlock_irqrestore(&mq->lock, flags);
-
- return r;
-}
-
-static void __force_mapping(struct smq_policy *mq,
- dm_oblock_t current_oblock, dm_oblock_t new_oblock)
-{
- struct entry *e = h_lookup(&mq->table, current_oblock);
+ if (!e->allocated)
+ return -ENODATA;
- if (e) {
- del(mq, e);
- e->oblock = new_oblock;
- e->dirty = true;
- push(mq, e);
- }
+ // FIXME: what if this block has pending background work?
+ del_queue(mq, e);
+ h_remove(&mq->table, e);
+ free_entry(&mq->cache_alloc, e);
+ return 0;
}
-static void smq_force_mapping(struct dm_cache_policy *p,
- dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
{
- unsigned long flags;
struct smq_policy *mq = to_smq_policy(p);
+ struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
- spin_lock_irqsave(&mq->lock, flags);
- __force_mapping(mq, current_oblock, new_oblock);
- spin_unlock_irqrestore(&mq->lock, flags);
+ if (!e->allocated)
+ return 0;
+
+ return e->level;
}
static dm_cblock_t smq_residency(struct dm_cache_policy *p)
@@ -1546,6 +1602,12 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block)
spin_unlock_irqrestore(&mq->lock, flags);
}
+static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+ mq->migrations_allowed = allow;
+}
+
/*
* smq has no config values, but the old mq policy did. To avoid breaking
* software we continue to accept these configurables for the mq policy,
@@ -1590,18 +1652,18 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
{
mq->policy.destroy = smq_destroy;
- mq->policy.map = smq_map;
mq->policy.lookup = smq_lookup;
+ mq->policy.lookup_with_work = smq_lookup_with_work;
+ mq->policy.get_background_work = smq_get_background_work;
+ mq->policy.complete_background_work = smq_complete_background_work;
mq->policy.set_dirty = smq_set_dirty;
mq->policy.clear_dirty = smq_clear_dirty;
mq->policy.load_mapping = smq_load_mapping;
+ mq->policy.invalidate_mapping = smq_invalidate_mapping;
mq->policy.get_hint = smq_get_hint;
- mq->policy.remove_mapping = smq_remove_mapping;
- mq->policy.remove_cblock = smq_remove_cblock;
- mq->policy.writeback_work = smq_writeback_work;
- mq->policy.force_mapping = smq_force_mapping;
mq->policy.residency = smq_residency;
mq->policy.tick = smq_tick;
+ mq->policy.allow_migrations = smq_allow_migrations;
if (mimic_mq) {
mq->policy.set_config_value = mq_set_config_value;
@@ -1633,7 +1695,8 @@ static void calc_hotspot_params(sector_t origin_size,
static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
sector_t origin_size,
sector_t cache_block_size,
- bool mimic_mq)
+ bool mimic_mq,
+ bool migrations_allowed)
{
unsigned i;
unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
@@ -1658,11 +1721,11 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
}
init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
- for (i = 0; i < nr_sentinels_per_queue; i++)
+ for (i = 0; i < nr_sentinels_per_queue; i++)
get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
- for (i = 0; i < nr_sentinels_per_queue; i++)
+ for (i = 0; i < nr_sentinels_per_queue; i++)
get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
@@ -1715,8 +1778,16 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
mq->next_hotspot_period = jiffies;
mq->next_cache_period = jiffies;
+ mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */
+ if (!mq->bg_work)
+ goto bad_btracker;
+
+ mq->migrations_allowed = migrations_allowed;
+
return &mq->policy;
+bad_btracker:
+ h_exit(&mq->hotspot_table);
bad_alloc_hotspot_table:
h_exit(&mq->table);
bad_alloc_table:
@@ -1735,21 +1806,28 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
sector_t origin_size,
sector_t cache_block_size)
{
- return __smq_create(cache_size, origin_size, cache_block_size, false);
+ return __smq_create(cache_size, origin_size, cache_block_size, false, true);
}
static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
sector_t origin_size,
sector_t cache_block_size)
{
- return __smq_create(cache_size, origin_size, cache_block_size, true);
+ return __smq_create(cache_size, origin_size, cache_block_size, true, true);
+}
+
+static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t cache_block_size)
+{
+ return __smq_create(cache_size, origin_size, cache_block_size, false, false);
}
/*----------------------------------------------------------------*/
static struct dm_cache_policy_type smq_policy_type = {
.name = "smq",
- .version = {1, 5, 0},
+ .version = {2, 0, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = smq_create
@@ -1757,15 +1835,23 @@ static struct dm_cache_policy_type smq_policy_type = {
static struct dm_cache_policy_type mq_policy_type = {
.name = "mq",
- .version = {1, 5, 0},
+ .version = {2, 0, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create,
};
+static struct dm_cache_policy_type cleaner_policy_type = {
+ .name = "cleaner",
+ .version = {2, 0, 0},
+ .hint_size = 4,
+ .owner = THIS_MODULE,
+ .create = cleaner_create,
+};
+
static struct dm_cache_policy_type default_policy_type = {
.name = "default",
- .version = {1, 5, 0},
+ .version = {2, 0, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = smq_create,
@@ -1785,23 +1871,36 @@ static int __init smq_init(void)
r = dm_cache_policy_register(&mq_policy_type);
if (r) {
DMERR("register failed (as mq) %d", r);
- dm_cache_policy_unregister(&smq_policy_type);
- return -ENOMEM;
+ goto out_mq;
+ }
+
+ r = dm_cache_policy_register(&cleaner_policy_type);
+ if (r) {
+ DMERR("register failed (as cleaner) %d", r);
+ goto out_cleaner;
}
r = dm_cache_policy_register(&default_policy_type);
if (r) {
DMERR("register failed (as default) %d", r);
- dm_cache_policy_unregister(&mq_policy_type);
- dm_cache_policy_unregister(&smq_policy_type);
- return -ENOMEM;
+ goto out_default;
}
return 0;
+
+out_default:
+ dm_cache_policy_unregister(&cleaner_policy_type);
+out_cleaner:
+ dm_cache_policy_unregister(&mq_policy_type);
+out_mq:
+ dm_cache_policy_unregister(&smq_policy_type);
+
+ return -ENOMEM;
}
static void __exit smq_exit(void)
{
+ dm_cache_policy_unregister(&cleaner_policy_type);
dm_cache_policy_unregister(&smq_policy_type);
dm_cache_policy_unregister(&mq_policy_type);
dm_cache_policy_unregister(&default_policy_type);
@@ -1816,3 +1915,4 @@ MODULE_DESCRIPTION("smq cache policy");
MODULE_ALIAS("dm-cache-default");
MODULE_ALIAS("dm-cache-mq");
+MODULE_ALIAS("dm-cache-cleaner");
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index aa10b1493f34..c05fc3436cef 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -13,183 +13,100 @@
/*----------------------------------------------------------------*/
-/* FIXME: make it clear which methods are optional. Get debug policy to
- * double check this at start.
- */
-
/*
* The cache policy makes the important decisions about which blocks get to
* live on the faster cache device.
- *
- * When the core target has to remap a bio it calls the 'map' method of the
- * policy. This returns an instruction telling the core target what to do.
- *
- * POLICY_HIT:
- * That block is in the cache. Remap to the cache and carry on.
- *
- * POLICY_MISS:
- * This block is on the origin device. Remap and carry on.
- *
- * POLICY_NEW:
- * This block is currently on the origin device, but the policy wants to
- * move it. The core should:
- *
- * - hold any further io to this origin block
- * - copy the origin to the given cache block
- * - release all the held blocks
- * - remap the original block to the cache
- *
- * POLICY_REPLACE:
- * This block is currently on the origin device. The policy wants to
- * move it to the cache, with the added complication that the destination
- * cache block needs a writeback first. The core should:
- *
- * - hold any further io to this origin block
- * - hold any further io to the origin block that's being written back
- * - writeback
- * - copy new block to cache
- * - release held blocks
- * - remap bio to cache and reissue.
- *
- * Should the core run into trouble while processing a POLICY_NEW or
- * POLICY_REPLACE instruction it will roll back the policies mapping using
- * remove_mapping() or force_mapping(). These methods must not fail. This
- * approach avoids having transactional semantics in the policy (ie, the
- * core informing the policy when a migration is complete), and hence makes
- * it easier to write new policies.
- *
- * In general policy methods should never block, except in the case of the
- * map function when can_migrate is set. So be careful to implement using
- * bounded, preallocated memory.
*/
enum policy_operation {
- POLICY_HIT,
- POLICY_MISS,
- POLICY_NEW,
- POLICY_REPLACE
-};
-
-/*
- * When issuing a POLICY_REPLACE the policy needs to make a callback to
- * lock the block being demoted. This doesn't need to occur during a
- * writeback operation since the block remains in the cache.
- */
-struct policy_locker;
-typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock);
-
-struct policy_locker {
- policy_lock_fn fn;
+ POLICY_PROMOTE,
+ POLICY_DEMOTE,
+ POLICY_WRITEBACK
};
/*
* This is the instruction passed back to the core target.
*/
-struct policy_result {
+struct policy_work {
enum policy_operation op;
- dm_oblock_t old_oblock; /* POLICY_REPLACE */
- dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */
+ dm_oblock_t oblock;
+ dm_cblock_t cblock;
};
/*
- * The cache policy object. Just a bunch of methods. It is envisaged that
- * this structure will be embedded in a bigger, policy specific structure
- * (ie. use container_of()).
+ * The cache policy object. It is envisaged that this structure will be
+ * embedded in a bigger, policy specific structure (ie. use container_of()).
*/
struct dm_cache_policy {
-
- /*
- * FIXME: make it clear which methods are optional, and which may
- * block.
- */
-
/*
* Destroys this object.
*/
void (*destroy)(struct dm_cache_policy *p);
/*
- * See large comment above.
- *
- * oblock - the origin block we're interested in.
- *
- * can_block - indicates whether the current thread is allowed to
- * block. -EWOULDBLOCK returned if it can't and would.
- *
- * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE
- * instructions. If denied and the policy would have
- * returned one of these instructions it should
- * return -EWOULDBLOCK.
+ * Find the location of a block.
*
- * discarded_oblock - indicates whether the whole origin block is
- * in a discarded state (FIXME: better to tell the
- * policy about this sooner, so it can recycle that
- * cache block if it wants.)
- * bio - the bio that triggered this call.
- * result - gets filled in with the instruction.
+ * Must not block.
*
- * May only return 0, or -EWOULDBLOCK (if !can_migrate)
+ * Returns 0 if in cache (cblock will be set), -ENOENT if not, < 0 for
+ * other errors (-EWOULDBLOCK would be typical). data_dir should be
+ * READ or WRITE. fast_copy should be set if migrating this block would
+ * be 'cheap' somehow (eg, discarded data). background_queued will be set
+ * if a migration has just been queued.
*/
- int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock,
- bool can_block, bool can_migrate, bool discarded_oblock,
- struct bio *bio, struct policy_locker *locker,
- struct policy_result *result);
+ int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
+ int data_dir, bool fast_copy, bool *background_queued);
/*
- * Sometimes we want to see if a block is in the cache, without
- * triggering any update of stats. (ie. it's not a real hit).
- *
- * Must not block.
+ * Sometimes the core target can optimise a migration, eg, the
+ * block may be discarded, or the bio may cover an entire block.
+ * To optimise, though, it needs the migration immediately so it
+ * knows to do something different with the bio.
*
- * Returns 0 if in cache, -ENOENT if not, < 0 for other errors
- * (-EWOULDBLOCK would be typical).
+ * This method is optional (policy-internal will fall back to using
+ * lookup).
*/
- int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
-
- void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
- void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
+ int (*lookup_with_work)(struct dm_cache_policy *p,
+ dm_oblock_t oblock, dm_cblock_t *cblock,
+ int data_dir, bool fast_copy,
+ struct policy_work **work);
/*
- * Called when a cache target is first created. Used to load a
- * mapping from the metadata device into the policy.
+ * Retrieves background work. Returns -ENODATA when there's no
+ * background work.
*/
- int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
- dm_cblock_t cblock, uint32_t hint, bool hint_valid);
+ int (*get_background_work)(struct dm_cache_policy *p, bool idle,
+ struct policy_work **result);
/*
- * Gets the hint for a given cblock. Called in a single threaded
- * context. So no locking required.
+ * You must pass in the same work pointer that you were given, not
+ * a copy.
*/
- uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock);
+ void (*complete_background_work)(struct dm_cache_policy *p,
+ struct policy_work *work,
+ bool success);
+
+ void (*set_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock);
+ void (*clear_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock);
/*
- * Override functions used on the error paths of the core target.
- * They must succeed.
+ * Called when a cache target is first created. Used to load a
+ * mapping from the metadata device into the policy.
*/
- void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock);
- void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock,
- dm_oblock_t new_oblock);
+ int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
+ dm_cblock_t cblock, bool dirty,
+ uint32_t hint, bool hint_valid);
/*
- * This is called via the invalidate_cblocks message. It is
- * possible the particular cblock has already been removed due to a
- * write io in passthrough mode. In which case this should return
- * -ENODATA.
+ * Drops the mapping, irrespective of whether it's clean or dirty.
+ * Returns -ENODATA if cblock is not mapped.
*/
- int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
+ int (*invalidate_mapping)(struct dm_cache_policy *p, dm_cblock_t cblock);
/*
- * Provide a dirty block to be written back by the core target. If
- * critical_only is set then the policy should only provide work if
- * it urgently needs it.
- *
- * Returns:
- *
- * 0 and @cblock,@oblock: block to write back provided
- *
- * -ENODATA: no dirty blocks available
+ * Gets the hint for a given cblock. Called in a single-threaded
+ * context, so no locking is required.
*/
- int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock,
- bool critical_only);
+ uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock);
/*
* How full is the cache?
@@ -202,6 +119,8 @@ struct dm_cache_policy {
* queue merging has occurred). To stop the policy being fooled by
* these, the core target sends regular tick() calls to the policy.
* The policy should only count an entry as hit once per tick.
+ *
+ * This method is optional.
*/
void (*tick)(struct dm_cache_policy *p, bool can_block);
@@ -213,6 +132,8 @@ struct dm_cache_policy {
int (*set_config_value)(struct dm_cache_policy *p,
const char *key, const char *value);
+ void (*allow_migrations)(struct dm_cache_policy *p, bool allow);
+
/*
* Book keeping ptr for the policy register, not for general use.
*/
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index e04c61e0839e..c5ea03fc7ee1 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -5,7 +5,7 @@
*/
#include "dm.h"
-#include "dm-bio-prison.h"
+#include "dm-bio-prison-v2.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"
@@ -15,6 +15,7 @@
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
+#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -25,7 +26,18 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
/*----------------------------------------------------------------*/
-#define IOT_RESOLUTION 4
+/*
+ * Glossary:
+ *
+ * oblock: index of an origin block
+ * cblock: index of a cache block
+ * promotion: movement of a block from origin to cache
+ * demotion: movement of a block from cache to origin
+ * migration: movement of a block between the origin and cache device,
+ * either direction
+ */
+
+/*----------------------------------------------------------------*/
struct io_tracker {
spinlock_t lock;
@@ -82,6 +94,9 @@ static void iot_io_begin(struct io_tracker *iot, sector_t len)
static void __iot_io_end(struct io_tracker *iot, sector_t len)
{
+ if (!len)
+ return;
+
iot->in_flight -= len;
if (!iot->in_flight)
iot->idle_time = jiffies;
@@ -99,19 +114,177 @@ static void iot_io_end(struct io_tracker *iot, sector_t len)
/*----------------------------------------------------------------*/
/*
- * Glossary:
- *
- * oblock: index of an origin block
- * cblock: index of a cache block
- * promotion: movement of a block from origin to cache
- * demotion: movement of a block from cache to origin
- * migration: movement of a block between the origin and cache device,
- * either direction
+ * Represents a chunk of future work. 'input' allows continuations to pass
+ * values between themselves, typically error values.
*/
+struct continuation {
+ struct work_struct ws;
+ blk_status_t input;
+};
+
+static inline void init_continuation(struct continuation *k,
+ void (*fn)(struct work_struct *))
+{
+ INIT_WORK(&k->ws, fn);
+ k->input = 0;
+}
+
+static inline void queue_continuation(struct workqueue_struct *wq,
+ struct continuation *k)
+{
+ queue_work(wq, &k->ws);
+}
/*----------------------------------------------------------------*/
/*
+ * The batcher collects together pieces of work that need a particular
+ * operation to occur before they can proceed (typically a commit).
+ */
+struct batcher {
+ /*
+ * The operation that everyone is waiting for.
+ */
+ blk_status_t (*commit_op)(void *context);
+ void *commit_context;
+
+ /*
+ * This is how bios should be issued once the commit op is complete
+ * (accounted_request).
+ */
+ void (*issue_op)(struct bio *bio, void *context);
+ void *issue_context;
+
+ /*
+ * Queued work is put on this workqueue after the commit.
+ */
+ struct workqueue_struct *wq;
+
+ spinlock_t lock;
+ struct list_head work_items;
+ struct bio_list bios;
+ struct work_struct commit_work;
+
+ bool commit_scheduled;
+};
+
+static void __commit(struct work_struct *_ws)
+{
+ struct batcher *b = container_of(_ws, struct batcher, commit_work);
+ blk_status_t r;
+ unsigned long flags;
+ struct list_head work_items;
+ struct work_struct *ws, *tmp;
+ struct continuation *k;
+ struct bio *bio;
+ struct bio_list bios;
+
+ INIT_LIST_HEAD(&work_items);
+ bio_list_init(&bios);
+
+ /*
+ * We have to grab these before the commit_op to avoid a race
+ * condition.
+ */
+ spin_lock_irqsave(&b->lock, flags);
+ list_splice_init(&b->work_items, &work_items);
+ bio_list_merge(&bios, &b->bios);
+ bio_list_init(&b->bios);
+ b->commit_scheduled = false;
+ spin_unlock_irqrestore(&b->lock, flags);
+
+ r = b->commit_op(b->commit_context);
+
+ list_for_each_entry_safe(ws, tmp, &work_items, entry) {
+ k = container_of(ws, struct continuation, ws);
+ k->input = r;
+ INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
+ queue_work(b->wq, ws);
+ }
+
+ while ((bio = bio_list_pop(&bios))) {
+ if (r) {
+ bio->bi_status = r;
+ bio_endio(bio);
+ } else
+ b->issue_op(bio, b->issue_context);
+ }
+}
+
+static void batcher_init(struct batcher *b,
+ blk_status_t (*commit_op)(void *),
+ void *commit_context,
+ void (*issue_op)(struct bio *bio, void *),
+ void *issue_context,
+ struct workqueue_struct *wq)
+{
+ b->commit_op = commit_op;
+ b->commit_context = commit_context;
+ b->issue_op = issue_op;
+ b->issue_context = issue_context;
+ b->wq = wq;
+
+ spin_lock_init(&b->lock);
+ INIT_LIST_HEAD(&b->work_items);
+ bio_list_init(&b->bios);
+ INIT_WORK(&b->commit_work, __commit);
+ b->commit_scheduled = false;
+}
+
+static void async_commit(struct batcher *b)
+{
+ queue_work(b->wq, &b->commit_work);
+}
+
+static void continue_after_commit(struct batcher *b, struct continuation *k)
+{
+ unsigned long flags;
+ bool commit_scheduled;
+
+ spin_lock_irqsave(&b->lock, flags);
+ commit_scheduled = b->commit_scheduled;
+ list_add_tail(&k->ws.entry, &b->work_items);
+ spin_unlock_irqrestore(&b->lock, flags);
+
+ if (commit_scheduled)
+ async_commit(b);
+}
+
+/*
+ * Bios are errored if the commit fails.
+ */
+static void issue_after_commit(struct batcher *b, struct bio *bio)
+{
+ unsigned long flags;
+ bool commit_scheduled;
+
+ spin_lock_irqsave(&b->lock, flags);
+ commit_scheduled = b->commit_scheduled;
+ bio_list_add(&b->bios, bio);
+ spin_unlock_irqrestore(&b->lock, flags);
+
+ if (commit_scheduled)
+ async_commit(b);
+}
+
+/*
+ * Call this if some urgent work is waiting for the commit to complete.
+ */
+static void schedule_commit(struct batcher *b)
+{
+ bool immediate;
+ unsigned long flags;
+
+ spin_lock_irqsave(&b->lock, flags);
+ immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
+ b->commit_scheduled = true;
+ spin_unlock_irqrestore(&b->lock, flags);
+
+ if (immediate)
+ async_commit(b);
+}
+
+/*
* There are a couple of places where we let a bio run, but want to do some
* work before calling its endio function. We do this by temporarily
* changing the endio fn.
@@ -179,6 +352,7 @@ enum cache_io_mode {
struct cache_features {
enum cache_metadata_mode mode;
enum cache_io_mode io_mode;
+ unsigned metadata_version;
};
struct cache_stats {
@@ -188,31 +362,13 @@ struct cache_stats {
atomic_t write_miss;
atomic_t demotion;
atomic_t promotion;
+ atomic_t writeback;
atomic_t copies_avoided;
atomic_t cache_cell_clash;
atomic_t commit_count;
atomic_t discard_count;
};
-/*
- * Defines a range of cblocks, begin to (end - 1) are in the range. end is
- * the one-past-the-end value.
- */
-struct cblock_range {
- dm_cblock_t begin;
- dm_cblock_t end;
-};
-
-struct invalidation_request {
- struct list_head list;
- struct cblock_range *cblocks;
-
- atomic_t complete;
- int err;
-
- wait_queue_head_t result_wait;
-};
-
struct cache {
struct dm_target *ti;
struct dm_target_callbacks callbacks;
@@ -248,17 +404,13 @@ struct cache {
/*
* Fields for converting from sectors to blocks.
*/
- uint32_t sectors_per_block;
+ sector_t sectors_per_block;
int sectors_per_block_shift;
spinlock_t lock;
struct list_head deferred_cells;
struct bio_list deferred_bios;
- struct bio_list deferred_flush_bios;
struct bio_list deferred_writethrough_bios;
- struct list_head quiesced_migrations;
- struct list_head completed_migrations;
- struct list_head need_commit_migrations;
sector_t migration_threshold;
wait_queue_head_t migration_wait;
atomic_t nr_allocated_migrations;
@@ -269,9 +421,7 @@ struct cache {
*/
atomic_t nr_io_migrations;
- wait_queue_head_t quiescing_wait;
- atomic_t quiescing;
- atomic_t quiescing_ack;
+ struct rw_semaphore quiesce_lock;
/*
* cache_size entries, dirty if set
@@ -295,13 +445,11 @@ struct cache {
struct dm_kcopyd_client *copier;
struct workqueue_struct *wq;
- struct work_struct worker;
-
+ struct work_struct deferred_bio_worker;
+ struct work_struct deferred_writethrough_worker;
+ struct work_struct migration_worker;
struct delayed_work waker;
- unsigned long last_commit_jiffies;
-
- struct dm_bio_prison *prison;
- struct dm_deferred_set *all_io_ds;
+ struct dm_bio_prison_v2 *prison;
mempool_t *migration_pool;
@@ -328,13 +476,18 @@ struct cache {
spinlock_t invalidation_lock;
struct list_head invalidation_requests;
- struct io_tracker origin_tracker;
+ struct io_tracker tracker;
+
+ struct work_struct commit_ws;
+ struct batcher committer;
+
+ struct rw_semaphore background_work_lock;
};
struct per_bio_data {
bool tick:1;
unsigned req_nr:2;
- struct dm_deferred_entry *all_io_entry;
+ struct dm_bio_prison_cell_v2 *cell;
struct dm_hook_info hook_info;
sector_t len;
@@ -349,55 +502,64 @@ struct per_bio_data {
};
struct dm_cache_migration {
- struct list_head list;
+ struct continuation k;
struct cache *cache;
- unsigned long start_jiffies;
- dm_oblock_t old_oblock;
- dm_oblock_t new_oblock;
- dm_cblock_t cblock;
-
- bool err:1;
- bool discard:1;
- bool writeback:1;
- bool demote:1;
- bool promote:1;
- bool requeue_holder:1;
- bool invalidate:1;
+ struct policy_work *op;
+ struct bio *overwrite_bio;
+ struct dm_bio_prison_cell_v2 *cell;
- struct dm_bio_prison_cell *old_ocell;
- struct dm_bio_prison_cell *new_ocell;
+ dm_cblock_t invalidate_cblock;
+ dm_oblock_t invalidate_oblock;
};
-/*
- * Processing a bio in the worker thread may require these memory
- * allocations. We prealloc to avoid deadlocks (the same worker thread
- * frees them back to the mempool).
- */
-struct prealloc {
- struct dm_cache_migration *mg;
- struct dm_bio_prison_cell *cell1;
- struct dm_bio_prison_cell *cell2;
-};
+/*----------------------------------------------------------------*/
+
+static bool writethrough_mode(struct cache_features *f)
+{
+ return f->io_mode == CM_IO_WRITETHROUGH;
+}
-static enum cache_metadata_mode get_cache_mode(struct cache *cache);
+static bool writeback_mode(struct cache_features *f)
+{
+ return f->io_mode == CM_IO_WRITEBACK;
+}
-static void wake_worker(struct cache *cache)
+static inline bool passthrough_mode(struct cache_features *f)
{
- queue_work(cache->wq, &cache->worker);
+ return unlikely(f->io_mode == CM_IO_PASSTHROUGH);
}
/*----------------------------------------------------------------*/
-static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
+static void wake_deferred_bio_worker(struct cache *cache)
+{
+ queue_work(cache->wq, &cache->deferred_bio_worker);
+}
+
+static void wake_deferred_writethrough_worker(struct cache *cache)
{
- /* FIXME: change to use a local slab. */
- return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
+ queue_work(cache->wq, &cache->deferred_writethrough_worker);
}
-static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
+static void wake_migration_worker(struct cache *cache)
{
- dm_bio_prison_free_cell(cache->prison, cell);
+ if (passthrough_mode(&cache->features))
+ return;
+
+ queue_work(cache->wq, &cache->migration_worker);
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
+{
+ return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
+}
+
+static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
+{
+ dm_bio_prison_free_cell_v2(cache->prison, cell);
}
static struct dm_cache_migration *alloc_migration(struct cache *cache)
@@ -423,146 +585,127 @@ static void free_migration(struct dm_cache_migration *mg)
mempool_free(mg, cache->migration_pool);
}
-static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
-{
- if (!p->mg) {
- p->mg = alloc_migration(cache);
- if (!p->mg)
- return -ENOMEM;
- }
-
- if (!p->cell1) {
- p->cell1 = alloc_prison_cell(cache);
- if (!p->cell1)
- return -ENOMEM;
- }
-
- if (!p->cell2) {
- p->cell2 = alloc_prison_cell(cache);
- if (!p->cell2)
- return -ENOMEM;
- }
-
- return 0;
-}
+/*----------------------------------------------------------------*/
-static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
+static inline dm_oblock_t oblock_succ(dm_oblock_t b)
{
- if (p->cell2)
- free_prison_cell(cache, p->cell2);
-
- if (p->cell1)
- free_prison_cell(cache, p->cell1);
-
- if (p->mg)
- free_migration(p->mg);
+ return to_oblock(from_oblock(b) + 1ull);
}
-static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
+static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
{
- struct dm_cache_migration *mg = p->mg;
-
- BUG_ON(!mg);
- p->mg = NULL;
-
- return mg;
+ key->virtual = 0;
+ key->dev = 0;
+ key->block_begin = from_oblock(begin);
+ key->block_end = from_oblock(end);
}
/*
- * You must have a cell within the prealloc struct to return. If not this
- * function will BUG() rather than returning NULL.
+ * We have two lock levels: level 0, which is used to prevent WRITEs, and
+ * level 1, which prevents *both* READs and WRITEs.
*/
-static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
+#define WRITE_LOCK_LEVEL 0
+#define READ_WRITE_LOCK_LEVEL 1
+
+static unsigned lock_level(struct bio *bio)
{
- struct dm_bio_prison_cell *r = NULL;
+ return bio_data_dir(bio) == WRITE ?
+ WRITE_LOCK_LEVEL :
+ READ_WRITE_LOCK_LEVEL;
+}
- if (p->cell1) {
- r = p->cell1;
- p->cell1 = NULL;
+/*----------------------------------------------------------------
+ * Per bio data
+ *--------------------------------------------------------------*/
- } else if (p->cell2) {
- r = p->cell2;
- p->cell2 = NULL;
- } else
- BUG();
+/*
+ * If using writeback, leave out struct per_bio_data's writethrough fields.
+ */
+#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
+#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
- return r;
+static size_t get_per_bio_data_size(struct cache *cache)
+{
+ return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
}
-/*
- * You can't have more than two cells in a prealloc struct. BUG() will be
- * called if you try and overfill.
- */
-static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
+static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
{
- if (!p->cell2)
- p->cell2 = cell;
+ struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
+ BUG_ON(!pb);
+ return pb;
+}
- else if (!p->cell1)
- p->cell1 = cell;
+static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
+{
+ struct per_bio_data *pb = get_per_bio_data(bio, data_size);
- else
- BUG();
+ pb->tick = false;
+ pb->req_nr = dm_bio_get_target_bio_nr(bio);
+ pb->cell = NULL;
+ pb->len = 0;
+
+ return pb;
}
/*----------------------------------------------------------------*/
-static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
+static void defer_bio(struct cache *cache, struct bio *bio)
{
- key->virtual = 0;
- key->dev = 0;
- key->block_begin = from_oblock(begin);
- key->block_end = from_oblock(end);
-}
+ unsigned long flags;
-/*
- * The caller hands in a preallocated cell, and a free function for it.
- * The cell will be freed if there's an error, or if it wasn't used because
- * a cell with that key already exists.
- */
-typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_add(&cache->deferred_bios, bio);
+ spin_unlock_irqrestore(&cache->lock, flags);
-static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
- struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
- cell_free_fn free_fn, void *free_context,
- struct dm_bio_prison_cell **cell_result)
+ wake_deferred_bio_worker(cache);
+}
+
+static void defer_bios(struct cache *cache, struct bio_list *bios)
{
- int r;
- struct dm_cell_key key;
+ unsigned long flags;
- build_key(oblock_begin, oblock_end, &key);
- r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
- if (r)
- free_fn(free_context, cell_prealloc);
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_merge(&cache->deferred_bios, bios);
+ bio_list_init(bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
- return r;
+ wake_deferred_bio_worker(cache);
}
-static int bio_detain(struct cache *cache, dm_oblock_t oblock,
- struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
- cell_free_fn free_fn, void *free_context,
- struct dm_bio_prison_cell **cell_result)
+/*----------------------------------------------------------------*/
+
+static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
{
+ bool r;
+ size_t pb_size;
+ struct per_bio_data *pb;
+ struct dm_cell_key_v2 key;
dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
- return bio_detain_range(cache, oblock, end, bio,
- cell_prealloc, free_fn, free_context, cell_result);
-}
+ struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
-static int get_cell(struct cache *cache,
- dm_oblock_t oblock,
- struct prealloc *structs,
- struct dm_bio_prison_cell **cell_result)
-{
- int r;
- struct dm_cell_key key;
- struct dm_bio_prison_cell *cell_prealloc;
+ cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
+ if (!cell_prealloc) {
+ defer_bio(cache, bio);
+ return false;
+ }
+
+ build_key(oblock, end, &key);
+ r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
+ if (!r) {
+ /*
+ * Failed to get the lock.
+ */
+ free_prison_cell(cache, cell_prealloc);
+ return r;
+ }
- cell_prealloc = prealloc_get_cell(structs);
+ if (cell != cell_prealloc)
+ free_prison_cell(cache, cell_prealloc);
- build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
- r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
- if (r)
- prealloc_put_cell(structs, cell_prealloc);
+ pb_size = get_per_bio_data_size(cache);
+ pb = get_per_bio_data(bio, pb_size);
+ pb->cell = cell;
return r;
}
@@ -574,21 +717,33 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b)
return test_bit(from_cblock(b), cache->dirty_bitset);
}
-static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+static void set_dirty(struct cache *cache, dm_cblock_t cblock)
{
if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
atomic_inc(&cache->nr_dirty);
- policy_set_dirty(cache->policy, oblock);
+ policy_set_dirty(cache->policy, cblock);
}
}
-static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+/*
+ * These two are called when a migration completes, to force the policy
+ * and dirty bitset to be in sync.
+ */
+static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
+{
+ if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
+ atomic_inc(&cache->nr_dirty);
+ policy_set_dirty(cache->policy, cblock);
+}
+
+static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
{
if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
- policy_clear_dirty(cache->policy, oblock);
if (atomic_dec_return(&cache->nr_dirty) == 0)
dm_table_event(cache->ti->table);
}
+
+ policy_clear_dirty(cache->policy, cblock);
}
/*----------------------------------------------------------------*/
@@ -627,11 +782,6 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
oblocks_per_dblock(cache)));
}
-static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
-{
- return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
-}
-
static void set_discard(struct cache *cache, dm_dblock_t b)
{
unsigned long flags;
@@ -678,83 +828,6 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
return r;
}
-/*----------------------------------------------------------------*/
-
-static void load_stats(struct cache *cache)
-{
- struct dm_cache_statistics stats;
-
- dm_cache_metadata_get_stats(cache->cmd, &stats);
- atomic_set(&cache->stats.read_hit, stats.read_hits);
- atomic_set(&cache->stats.read_miss, stats.read_misses);
- atomic_set(&cache->stats.write_hit, stats.write_hits);
- atomic_set(&cache->stats.write_miss, stats.write_misses);
-}
-
-static void save_stats(struct cache *cache)
-{
- struct dm_cache_statistics stats;
-
- if (get_cache_mode(cache) >= CM_READ_ONLY)
- return;
-
- stats.read_hits = atomic_read(&cache->stats.read_hit);
- stats.read_misses = atomic_read(&cache->stats.read_miss);
- stats.write_hits = atomic_read(&cache->stats.write_hit);
- stats.write_misses = atomic_read(&cache->stats.write_miss);
-
- dm_cache_metadata_set_stats(cache->cmd, &stats);
-}
-
-/*----------------------------------------------------------------
- * Per bio data
- *--------------------------------------------------------------*/
-
-/*
- * If using writeback, leave out struct per_bio_data's writethrough fields.
- */
-#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
-#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
-
-static bool writethrough_mode(struct cache_features *f)
-{
- return f->io_mode == CM_IO_WRITETHROUGH;
-}
-
-static bool writeback_mode(struct cache_features *f)
-{
- return f->io_mode == CM_IO_WRITEBACK;
-}
-
-static bool passthrough_mode(struct cache_features *f)
-{
- return f->io_mode == CM_IO_PASSTHROUGH;
-}
-
-static size_t get_per_bio_data_size(struct cache *cache)
-{
- return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
-}
-
-static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
-{
- struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
- BUG_ON(!pb);
- return pb;
-}
-
-static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
-{
- struct per_bio_data *pb = get_per_bio_data(bio, data_size);
-
- pb->tick = false;
- pb->req_nr = dm_bio_get_target_bio_nr(bio);
- pb->all_io_entry = NULL;
- pb->len = 0;
-
- return pb;
-}
-
/*----------------------------------------------------------------
* Remapping
*--------------------------------------------------------------*/
@@ -787,8 +860,7 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
spin_lock_irqsave(&cache->lock, flags);
- if (cache->need_tick_bio &&
- !(bio->bi_opf & (REQ_FUA | REQ_PREFLUSH)) &&
+ if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
bio_op(bio) != REQ_OP_DISCARD) {
pb->tick = true;
cache->need_tick_bio = false;
@@ -797,8 +869,9 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
}
static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
- dm_oblock_t oblock)
+ dm_oblock_t oblock)
{
+ // FIXME: this is called way too much.
check_if_tick_bio_needed(cache, bio);
remap_to_origin(cache, bio);
if (bio_data_dir(bio) == WRITE)
@@ -811,7 +884,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
check_if_tick_bio_needed(cache, bio);
remap_to_cache(cache, bio, cblock);
if (bio_data_dir(bio) == WRITE) {
- set_dirty(cache, oblock, cblock);
+ set_dirty(cache, cblock);
clear_discard(cache, oblock_to_dblock(cache, oblock));
}
}
@@ -828,31 +901,9 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
return to_oblock(block_nr);
}
-static int bio_triggers_commit(struct cache *cache, struct bio *bio)
-{
- return bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
-}
-
-/*
- * You must increment the deferred set whilst the prison cell is held. To
- * encourage this, we ask for 'cell' to be passed in.
- */
-static void inc_ds(struct cache *cache, struct bio *bio,
- struct dm_bio_prison_cell *cell)
-{
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
-
- BUG_ON(!cell);
- BUG_ON(pb->all_io_entry);
-
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-}
-
static bool accountable_bio(struct cache *cache, struct bio *bio)
{
- return ((bio->bi_bdev == cache->origin_dev->bdev) &&
- bio_op(bio) != REQ_OP_DISCARD);
+ return bio_op(bio) != REQ_OP_DISCARD;
}
static void accounted_begin(struct cache *cache, struct bio *bio)
@@ -862,7 +913,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio)
if (accountable_bio(cache, bio)) {
pb->len = bio_sectors(bio);
- iot_io_begin(&cache->origin_tracker, pb->len);
+ iot_io_begin(&cache->tracker, pb->len);
}
}
@@ -871,7 +922,7 @@ static void accounted_complete(struct cache *cache, struct bio *bio)
size_t pb_data_size = get_per_bio_data_size(cache);
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
- iot_io_end(&cache->origin_tracker, pb->len);
+ iot_io_end(&cache->tracker, pb->len);
}
static void accounted_request(struct cache *cache, struct bio *bio)
@@ -880,29 +931,10 @@ static void accounted_request(struct cache *cache, struct bio *bio)
generic_make_request(bio);
}
-static void issue(struct cache *cache, struct bio *bio)
+static void issue_op(struct bio *bio, void *context)
{
- unsigned long flags;
-
- if (!bio_triggers_commit(cache, bio)) {
- accounted_request(cache, bio);
- return;
- }
-
- /*
- * Batch together any bios that trigger commits and then issue a
- * single commit for them in do_worker().
- */
- spin_lock_irqsave(&cache->lock, flags);
- cache->commit_requested = true;
- bio_list_add(&cache->deferred_flush_bios, bio);
- spin_unlock_irqrestore(&cache->lock, flags);
-}
-
-static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
-{
- inc_ds(cache, bio, cell);
- issue(cache, bio);
+ struct cache *cache = context;
+ accounted_request(cache, bio);
}
static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
@@ -913,7 +945,7 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
bio_list_add(&cache->deferred_writethrough_bios, bio);
spin_unlock_irqrestore(&cache->lock, flags);
- wake_worker(cache);
+ wake_deferred_writethrough_worker(cache);
}
static void writethrough_endio(struct bio *bio)
@@ -922,7 +954,7 @@ static void writethrough_endio(struct bio *bio)
dm_unhook_bio(&pb->hook_info, bio);
- if (bio->bi_error) {
+ if (bio->bi_status) {
bio_endio(bio);
return;
}
@@ -939,6 +971,7 @@ static void writethrough_endio(struct bio *bio)
}
/*
+ * FIXME: send in parallel, huge latency as is.
* When running in writethrough mode we need to send writes to clean blocks
* to both the cache and origin devices. In future we'd like to clone the
* bio and send them in parallel, but for now we're doing them in
@@ -1051,12 +1084,58 @@ static void metadata_operation_failed(struct cache *cache, const char *op, int r
set_cache_mode(cache, CM_READ_ONLY);
}
+/*----------------------------------------------------------------*/
+
+static void load_stats(struct cache *cache)
+{
+ struct dm_cache_statistics stats;
+
+ dm_cache_metadata_get_stats(cache->cmd, &stats);
+ atomic_set(&cache->stats.read_hit, stats.read_hits);
+ atomic_set(&cache->stats.read_miss, stats.read_misses);
+ atomic_set(&cache->stats.write_hit, stats.write_hits);
+ atomic_set(&cache->stats.write_miss, stats.write_misses);
+}
+
+static void save_stats(struct cache *cache)
+{
+ struct dm_cache_statistics stats;
+
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return;
+
+ stats.read_hits = atomic_read(&cache->stats.read_hit);
+ stats.read_misses = atomic_read(&cache->stats.read_miss);
+ stats.write_hits = atomic_read(&cache->stats.write_hit);
+ stats.write_misses = atomic_read(&cache->stats.write_miss);
+
+ dm_cache_metadata_set_stats(cache->cmd, &stats);
+}
+
+static void update_stats(struct cache_stats *stats, enum policy_operation op)
+{
+ switch (op) {
+ case POLICY_PROMOTE:
+ atomic_inc(&stats->promotion);
+ break;
+
+ case POLICY_DEMOTE:
+ atomic_inc(&stats->demotion);
+ break;
+
+ case POLICY_WRITEBACK:
+ atomic_inc(&stats->writeback);
+ break;
+ }
+}
+
/*----------------------------------------------------------------
* Migration processing
*
* Migration covers moving data from the origin device to the cache, or
* vice versa.
*--------------------------------------------------------------*/
+
static void inc_io_migrations(struct cache *cache)
{
atomic_inc(&cache->nr_io_migrations);
@@ -1069,217 +1148,112 @@ static void dec_io_migrations(struct cache *cache)
static bool discard_or_flush(struct bio *bio)
{
- return bio_op(bio) == REQ_OP_DISCARD ||
- bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
-}
-
-static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
-{
- if (discard_or_flush(cell->holder)) {
- /*
- * We have to handle these bios individually.
- */
- dm_cell_release(cache->prison, cell, &cache->deferred_bios);
- free_prison_cell(cache, cell);
- } else
- list_add_tail(&cell->user_list, &cache->deferred_cells);
+ return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
}
-static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
+static void calc_discard_block_range(struct cache *cache, struct bio *bio,
+ dm_dblock_t *b, dm_dblock_t *e)
{
- unsigned long flags;
-
- if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
- /*
- * There was no prisoner to promote to holder, the
- * cell has been released.
- */
- free_prison_cell(cache, cell);
- return;
- }
+ sector_t sb = bio->bi_iter.bi_sector;
+ sector_t se = bio_end_sector(bio);
- spin_lock_irqsave(&cache->lock, flags);
- __cell_defer(cache, cell);
- spin_unlock_irqrestore(&cache->lock, flags);
+ *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
- wake_worker(cache);
+ if (se - sb < cache->discard_block_size)
+ *e = *b;
+ else
+ *e = to_dblock(block_div(se, cache->discard_block_size));
}
-static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
-{
- dm_cell_error(cache->prison, cell, err);
- free_prison_cell(cache, cell);
-}
+/*----------------------------------------------------------------*/
-static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
+static void prevent_background_work(struct cache *cache)
{
- cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
+ lockdep_off();
+ down_write(&cache->background_work_lock);
+ lockdep_on();
}
-static void free_io_migration(struct dm_cache_migration *mg)
+static void allow_background_work(struct cache *cache)
{
- struct cache *cache = mg->cache;
-
- dec_io_migrations(cache);
- free_migration(mg);
- wake_worker(cache);
+ lockdep_off();
+ up_write(&cache->background_work_lock);
+ lockdep_on();
}
-static void migration_failure(struct dm_cache_migration *mg)
+static bool background_work_begin(struct cache *cache)
{
- struct cache *cache = mg->cache;
- const char *dev_name = cache_device_name(cache);
-
- if (mg->writeback) {
- DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
- set_dirty(cache, mg->old_oblock, mg->cblock);
- cell_defer(cache, mg->old_ocell, false);
-
- } else if (mg->demote) {
- DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
- policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
+ bool r;
- cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
- if (mg->promote)
- cell_defer(cache, mg->new_ocell, true);
- } else {
- DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
- policy_remove_mapping(cache->policy, mg->new_oblock);
- cell_defer(cache, mg->new_ocell, true);
- }
+ lockdep_off();
+ r = down_read_trylock(&cache->background_work_lock);
+ lockdep_on();
- free_io_migration(mg);
+ return r;
}
-static void migration_success_pre_commit(struct dm_cache_migration *mg)
+static void background_work_end(struct cache *cache)
{
- int r;
- unsigned long flags;
- struct cache *cache = mg->cache;
-
- if (mg->writeback) {
- clear_dirty(cache, mg->old_oblock, mg->cblock);
- cell_defer(cache, mg->old_ocell, false);
- free_io_migration(mg);
- return;
+ lockdep_off();
+ up_read(&cache->background_work_lock);
+ lockdep_on();
+}
- } else if (mg->demote) {
- r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
- if (r) {
- DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
- cache_device_name(cache));
- metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
- policy_force_mapping(cache->policy, mg->new_oblock,
- mg->old_oblock);
- if (mg->promote)
- cell_defer(cache, mg->new_ocell, true);
- free_io_migration(mg);
- return;
- }
- } else {
- r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
- if (r) {
- DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
- cache_device_name(cache));
- metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
- policy_remove_mapping(cache->policy, mg->new_oblock);
- free_io_migration(mg);
- return;
- }
- }
+/*----------------------------------------------------------------*/
- spin_lock_irqsave(&cache->lock, flags);
- list_add_tail(&mg->list, &cache->need_commit_migrations);
- cache->commit_requested = true;
- spin_unlock_irqrestore(&cache->lock, flags);
+static void quiesce(struct dm_cache_migration *mg,
+ void (*continuation)(struct work_struct *))
+{
+ init_continuation(&mg->k, continuation);
+ dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
}
-static void migration_success_post_commit(struct dm_cache_migration *mg)
+static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
{
- unsigned long flags;
- struct cache *cache = mg->cache;
-
- if (mg->writeback) {
- DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
- cache_device_name(cache));
- return;
-
- } else if (mg->demote) {
- cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
-
- if (mg->promote) {
- mg->demote = false;
-
- spin_lock_irqsave(&cache->lock, flags);
- list_add_tail(&mg->list, &cache->quiesced_migrations);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- } else {
- if (mg->invalidate)
- policy_remove_mapping(cache->policy, mg->old_oblock);
- free_io_migration(mg);
- }
-
- } else {
- if (mg->requeue_holder) {
- clear_dirty(cache, mg->new_oblock, mg->cblock);
- cell_defer(cache, mg->new_ocell, true);
- } else {
- /*
- * The block was promoted via an overwrite, so it's dirty.
- */
- set_dirty(cache, mg->new_oblock, mg->cblock);
- bio_endio(mg->new_ocell->holder);
- cell_defer(cache, mg->new_ocell, false);
- }
- free_io_migration(mg);
- }
+ struct continuation *k = container_of(ws, struct continuation, ws);
+ return container_of(k, struct dm_cache_migration, k);
}
static void copy_complete(int read_err, unsigned long write_err, void *context)
{
- unsigned long flags;
- struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
- struct cache *cache = mg->cache;
+ struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
if (read_err || write_err)
- mg->err = true;
-
- spin_lock_irqsave(&cache->lock, flags);
- list_add_tail(&mg->list, &cache->completed_migrations);
- spin_unlock_irqrestore(&cache->lock, flags);
+ mg->k.input = BLK_STS_IOERR;
- wake_worker(cache);
+ queue_continuation(mg->cache->wq, &mg->k);
}
-static void issue_copy(struct dm_cache_migration *mg)
+static int copy(struct dm_cache_migration *mg, bool promote)
{
int r;
struct dm_io_region o_region, c_region;
struct cache *cache = mg->cache;
- sector_t cblock = from_cblock(mg->cblock);
o_region.bdev = cache->origin_dev->bdev;
+ o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
o_region.count = cache->sectors_per_block;
c_region.bdev = cache->cache_dev->bdev;
- c_region.sector = cblock * cache->sectors_per_block;
+ c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
c_region.count = cache->sectors_per_block;
- if (mg->writeback || mg->demote) {
- /* demote */
- o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
- r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
- } else {
- /* promote */
- o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
- r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
- }
+ if (promote)
+ r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
+ else
+ r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
- if (r < 0) {
- DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
- migration_failure(mg);
- }
+ return r;
+}
+
+static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
+{
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
+ free_prison_cell(cache, pb->cell);
+ pb->cell = NULL;
}
static void overwrite_endio(struct bio *bio)
@@ -1288,368 +1262,476 @@ static void overwrite_endio(struct bio *bio)
struct cache *cache = mg->cache;
size_t pb_data_size = get_per_bio_data_size(cache);
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
- unsigned long flags;
dm_unhook_bio(&pb->hook_info, bio);
- if (bio->bi_error)
- mg->err = true;
-
- mg->requeue_holder = false;
-
- spin_lock_irqsave(&cache->lock, flags);
- list_add_tail(&mg->list, &cache->completed_migrations);
- spin_unlock_irqrestore(&cache->lock, flags);
+ if (bio->bi_status)
+ mg->k.input = bio->bi_status;
- wake_worker(cache);
+ queue_continuation(mg->cache->wq, &mg->k);
}
-static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
+static void overwrite(struct dm_cache_migration *mg,
+ void (*continuation)(struct work_struct *))
{
+ struct bio *bio = mg->overwrite_bio;
size_t pb_data_size = get_per_bio_data_size(mg->cache);
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
- remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
/*
- * No need to inc_ds() here, since the cell will be held for the
- * duration of the io.
+ * The overwrite bio is part of the copy operation, as such it does
+ * not set/clear discard or dirty flags.
*/
+ if (mg->op->op == POLICY_PROMOTE)
+ remap_to_cache(mg->cache, bio, mg->op->cblock);
+ else
+ remap_to_origin(mg->cache, bio);
+
+ init_continuation(&mg->k, continuation);
accounted_request(mg->cache, bio);
}
-static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
+/*
+ * Migration steps:
+ *
+ * 1) exclusive lock preventing WRITEs
+ * 2) quiesce
+ * 3) copy or issue overwrite bio
+ * 4) upgrade to exclusive lock preventing READs and WRITEs
+ * 5) quiesce
+ * 6) update metadata and commit
+ * 7) unlock
+ */
+static void mg_complete(struct dm_cache_migration *mg, bool success)
{
- return (bio_data_dir(bio) == WRITE) &&
- (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
-}
+ struct bio_list bios;
+ struct cache *cache = mg->cache;
+ struct policy_work *op = mg->op;
+ dm_cblock_t cblock = op->cblock;
+
+ if (success)
+ update_stats(&cache->stats, op->op);
+
+ switch (op->op) {
+ case POLICY_PROMOTE:
+ clear_discard(cache, oblock_to_dblock(cache, op->oblock));
+ policy_complete_background_work(cache->policy, op, success);
+
+ if (mg->overwrite_bio) {
+ if (success)
+ force_set_dirty(cache, cblock);
+ else if (mg->k.input)
+ mg->overwrite_bio->bi_status = mg->k.input;
+ else
+ mg->overwrite_bio->bi_status = BLK_STS_IOERR;
+ bio_endio(mg->overwrite_bio);
+ } else {
+ if (success)
+ force_clear_dirty(cache, cblock);
+ dec_io_migrations(cache);
+ }
+ break;
-static void avoid_copy(struct dm_cache_migration *mg)
-{
- atomic_inc(&mg->cache->stats.copies_avoided);
- migration_success_pre_commit(mg);
-}
+ case POLICY_DEMOTE:
+ /*
+ * We clear dirty here to update the nr_dirty counter.
+ */
+ if (success)
+ force_clear_dirty(cache, cblock);
+ policy_complete_background_work(cache->policy, op, success);
+ dec_io_migrations(cache);
+ break;
-static void calc_discard_block_range(struct cache *cache, struct bio *bio,
- dm_dblock_t *b, dm_dblock_t *e)
-{
- sector_t sb = bio->bi_iter.bi_sector;
- sector_t se = bio_end_sector(bio);
+ case POLICY_WRITEBACK:
+ if (success)
+ force_clear_dirty(cache, cblock);
+ policy_complete_background_work(cache->policy, op, success);
+ dec_io_migrations(cache);
+ break;
+ }
- *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
+ bio_list_init(&bios);
+ if (mg->cell) {
+ if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
+ free_prison_cell(cache, mg->cell);
+ }
- if (se - sb < cache->discard_block_size)
- *e = *b;
- else
- *e = to_dblock(block_div(se, cache->discard_block_size));
+ free_migration(mg);
+ defer_bios(cache, &bios);
+ wake_migration_worker(cache);
+
+ background_work_end(cache);
}
-static void issue_discard(struct dm_cache_migration *mg)
+static void mg_success(struct work_struct *ws)
{
- dm_dblock_t b, e;
- struct bio *bio = mg->new_ocell->holder;
- struct cache *cache = mg->cache;
-
- calc_discard_block_range(cache, bio, &b, &e);
- while (b != e) {
- set_discard(cache, b);
- b = to_dblock(from_dblock(b) + 1);
- }
-
- bio_endio(bio);
- cell_defer(cache, mg->new_ocell, false);
- free_migration(mg);
- wake_worker(cache);
+ struct dm_cache_migration *mg = ws_to_mg(ws);
+ mg_complete(mg, mg->k.input == 0);
}
-static void issue_copy_or_discard(struct dm_cache_migration *mg)
+static void mg_update_metadata(struct work_struct *ws)
{
- bool avoid;
+ int r;
+ struct dm_cache_migration *mg = ws_to_mg(ws);
struct cache *cache = mg->cache;
+ struct policy_work *op = mg->op;
- if (mg->discard) {
- issue_discard(mg);
- return;
- }
+ switch (op->op) {
+ case POLICY_PROMOTE:
+ r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
+ if (r) {
+ DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
+ cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
- if (mg->writeback || mg->demote)
- avoid = !is_dirty(cache, mg->cblock) ||
- is_discarded_oblock(cache, mg->old_oblock);
- else {
- struct bio *bio = mg->new_ocell->holder;
+ mg_complete(mg, false);
+ return;
+ }
+ mg_complete(mg, true);
+ break;
- avoid = is_discarded_oblock(cache, mg->new_oblock);
+ case POLICY_DEMOTE:
+ r = dm_cache_remove_mapping(cache->cmd, op->cblock);
+ if (r) {
+ DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
+ cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
- if (writeback_mode(&cache->features) &&
- !avoid && bio_writes_complete_block(cache, bio)) {
- issue_overwrite(mg, bio);
+ mg_complete(mg, false);
return;
}
- }
- avoid ? avoid_copy(mg) : issue_copy(mg);
+ /*
+ * It would be nice if we only had to commit when a REQ_FLUSH
+ * comes through. But there's one scenario that we have to
+ * look out for:
+ *
+ * - vblock x in a cache block
+ * - demotion occurs
+ * - cache block gets reallocated and overwritten
+ * - crash
+ *
+ * When we recover, because there was no commit the cache will
+ * rollback to having the data for vblock x in the cache block.
+ * But the cache block has since been overwritten, so it'll end
+ * up pointing to data that was never in 'x' during the history
+ * of the device.
+ *
+ * To avoid this issue we require a commit as part of the
+ * demotion operation.
+ */
+ init_continuation(&mg->k, mg_success);
+ continue_after_commit(&cache->committer, &mg->k);
+ schedule_commit(&cache->committer);
+ break;
+
+ case POLICY_WRITEBACK:
+ mg_complete(mg, true);
+ break;
+ }
}
-static void complete_migration(struct dm_cache_migration *mg)
+static void mg_update_metadata_after_copy(struct work_struct *ws)
{
- if (mg->err)
- migration_failure(mg);
+ struct dm_cache_migration *mg = ws_to_mg(ws);
+
+ /*
+ * Did the copy succeed?
+ */
+ if (mg->k.input)
+ mg_complete(mg, false);
else
- migration_success_pre_commit(mg);
+ mg_update_metadata(ws);
}
-static void process_migrations(struct cache *cache, struct list_head *head,
- void (*fn)(struct dm_cache_migration *))
+static void mg_upgrade_lock(struct work_struct *ws)
{
- unsigned long flags;
- struct list_head list;
- struct dm_cache_migration *mg, *tmp;
+ int r;
+ struct dm_cache_migration *mg = ws_to_mg(ws);
- INIT_LIST_HEAD(&list);
- spin_lock_irqsave(&cache->lock, flags);
- list_splice_init(head, &list);
- spin_unlock_irqrestore(&cache->lock, flags);
+ /*
+ * Did the copy succeed?
+ */
+ if (mg->k.input)
+ mg_complete(mg, false);
- list_for_each_entry_safe(mg, tmp, &list, list)
- fn(mg);
-}
+ else {
+ /*
+ * Now we want the lock to prevent both reads and writes.
+ */
+ r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
+ READ_WRITE_LOCK_LEVEL);
+ if (r < 0)
+ mg_complete(mg, false);
-static void __queue_quiesced_migration(struct dm_cache_migration *mg)
-{
- list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
+ else if (r)
+ quiesce(mg, mg_update_metadata);
+
+ else
+ mg_update_metadata(ws);
+ }
}
-static void queue_quiesced_migration(struct dm_cache_migration *mg)
+static void mg_copy(struct work_struct *ws)
{
- unsigned long flags;
- struct cache *cache = mg->cache;
+ int r;
+ struct dm_cache_migration *mg = ws_to_mg(ws);
- spin_lock_irqsave(&cache->lock, flags);
- __queue_quiesced_migration(mg);
- spin_unlock_irqrestore(&cache->lock, flags);
+ if (mg->overwrite_bio) {
+ /*
+ * It's safe to do this here, even though it's new data
+ * because all IO has been locked out of the block.
+ *
+ * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
+ * so _not_ using mg_upgrade_lock() as continuation.
+ */
+ overwrite(mg, mg_update_metadata_after_copy);
- wake_worker(cache);
-}
+ } else {
+ struct cache *cache = mg->cache;
+ struct policy_work *op = mg->op;
+ bool is_policy_promote = (op->op == POLICY_PROMOTE);
-static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
-{
- unsigned long flags;
- struct dm_cache_migration *mg, *tmp;
+ if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
+ is_discarded_oblock(cache, op->oblock)) {
+ mg_upgrade_lock(ws);
+ return;
+ }
- spin_lock_irqsave(&cache->lock, flags);
- list_for_each_entry_safe(mg, tmp, work, list)
- __queue_quiesced_migration(mg);
- spin_unlock_irqrestore(&cache->lock, flags);
+ init_continuation(&mg->k, mg_upgrade_lock);
- wake_worker(cache);
+ r = copy(mg, is_policy_promote);
+ if (r) {
+ DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
+ mg->k.input = BLK_STS_IOERR;
+ mg_complete(mg, false);
+ }
+ }
}
-static void check_for_quiesced_migrations(struct cache *cache,
- struct per_bio_data *pb)
+static int mg_lock_writes(struct dm_cache_migration *mg)
{
- struct list_head work;
+ int r;
+ struct dm_cell_key_v2 key;
+ struct cache *cache = mg->cache;
+ struct dm_bio_prison_cell_v2 *prealloc;
- if (!pb->all_io_entry)
- return;
+ prealloc = alloc_prison_cell(cache);
+ if (!prealloc) {
+ DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
+ mg_complete(mg, false);
+ return -ENOMEM;
+ }
- INIT_LIST_HEAD(&work);
- dm_deferred_entry_dec(pb->all_io_entry, &work);
+ /*
+ * Prevent writes to the block, but allow reads to continue.
+ * Unless we're using an overwrite bio, in which case we lock
+ * everything.
+ */
+ build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
+ r = dm_cell_lock_v2(cache->prison, &key,
+ mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
+ prealloc, &mg->cell);
+ if (r < 0) {
+ free_prison_cell(cache, prealloc);
+ mg_complete(mg, false);
+ return r;
+ }
- if (!list_empty(&work))
- queue_quiesced_migrations(cache, &work);
-}
+ if (mg->cell != prealloc)
+ free_prison_cell(cache, prealloc);
-static void quiesce_migration(struct dm_cache_migration *mg)
-{
- if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
- queue_quiesced_migration(mg);
+ if (r == 0)
+ mg_copy(&mg->k.ws);
+ else
+ quiesce(mg, mg_copy);
+
+ return 0;
}
-static void promote(struct cache *cache, struct prealloc *structs,
- dm_oblock_t oblock, dm_cblock_t cblock,
- struct dm_bio_prison_cell *cell)
+static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
{
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
+ struct dm_cache_migration *mg;
+
+ if (!background_work_begin(cache)) {
+ policy_complete_background_work(cache->policy, op, false);
+ return -EPERM;
+ }
+
+ mg = alloc_migration(cache);
+ if (!mg) {
+ policy_complete_background_work(cache->policy, op, false);
+ background_work_end(cache);
+ return -ENOMEM;
+ }
+
+ memset(mg, 0, sizeof(*mg));
- mg->err = false;
- mg->discard = false;
- mg->writeback = false;
- mg->demote = false;
- mg->promote = true;
- mg->requeue_holder = true;
- mg->invalidate = false;
mg->cache = cache;
- mg->new_oblock = oblock;
- mg->cblock = cblock;
- mg->old_ocell = NULL;
- mg->new_ocell = cell;
- mg->start_jiffies = jiffies;
+ mg->op = op;
+ mg->overwrite_bio = bio;
+
+ if (!bio)
+ inc_io_migrations(cache);
- inc_io_migrations(cache);
- quiesce_migration(mg);
+ return mg_lock_writes(mg);
}
-static void writeback(struct cache *cache, struct prealloc *structs,
- dm_oblock_t oblock, dm_cblock_t cblock,
- struct dm_bio_prison_cell *cell)
+/*----------------------------------------------------------------
+ * invalidation processing
+ *--------------------------------------------------------------*/
+
+static void invalidate_complete(struct dm_cache_migration *mg, bool success)
{
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
+ struct bio_list bios;
+ struct cache *cache = mg->cache;
- mg->err = false;
- mg->discard = false;
- mg->writeback = true;
- mg->demote = false;
- mg->promote = false;
- mg->requeue_holder = true;
- mg->invalidate = false;
- mg->cache = cache;
- mg->old_oblock = oblock;
- mg->cblock = cblock;
- mg->old_ocell = cell;
- mg->new_ocell = NULL;
- mg->start_jiffies = jiffies;
-
- inc_io_migrations(cache);
- quiesce_migration(mg);
-}
-
-static void demote_then_promote(struct cache *cache, struct prealloc *structs,
- dm_oblock_t old_oblock, dm_oblock_t new_oblock,
- dm_cblock_t cblock,
- struct dm_bio_prison_cell *old_ocell,
- struct dm_bio_prison_cell *new_ocell)
-{
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
-
- mg->err = false;
- mg->discard = false;
- mg->writeback = false;
- mg->demote = true;
- mg->promote = true;
- mg->requeue_holder = true;
- mg->invalidate = false;
- mg->cache = cache;
- mg->old_oblock = old_oblock;
- mg->new_oblock = new_oblock;
- mg->cblock = cblock;
- mg->old_ocell = old_ocell;
- mg->new_ocell = new_ocell;
- mg->start_jiffies = jiffies;
+ bio_list_init(&bios);
+ if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
+ free_prison_cell(cache, mg->cell);
- inc_io_migrations(cache);
- quiesce_migration(mg);
-}
+ if (!success && mg->overwrite_bio)
+ bio_io_error(mg->overwrite_bio);
-/*
- * Invalidate a cache entry. No writeback occurs; any changes in the cache
- * block are thrown away.
- */
-static void invalidate(struct cache *cache, struct prealloc *structs,
- dm_oblock_t oblock, dm_cblock_t cblock,
- struct dm_bio_prison_cell *cell)
-{
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
-
- mg->err = false;
- mg->discard = false;
- mg->writeback = false;
- mg->demote = true;
- mg->promote = false;
- mg->requeue_holder = true;
- mg->invalidate = true;
- mg->cache = cache;
- mg->old_oblock = oblock;
- mg->cblock = cblock;
- mg->old_ocell = cell;
- mg->new_ocell = NULL;
- mg->start_jiffies = jiffies;
+ free_migration(mg);
+ defer_bios(cache, &bios);
- inc_io_migrations(cache);
- quiesce_migration(mg);
+ background_work_end(cache);
}
-static void discard(struct cache *cache, struct prealloc *structs,
- struct dm_bio_prison_cell *cell)
+static void invalidate_completed(struct work_struct *ws)
{
- struct dm_cache_migration *mg = prealloc_get_migration(structs);
+ struct dm_cache_migration *mg = ws_to_mg(ws);
+ invalidate_complete(mg, !mg->k.input);
+}
- mg->err = false;
- mg->discard = true;
- mg->writeback = false;
- mg->demote = false;
- mg->promote = false;
- mg->requeue_holder = false;
- mg->invalidate = false;
- mg->cache = cache;
- mg->old_ocell = NULL;
- mg->new_ocell = cell;
- mg->start_jiffies = jiffies;
+static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
+{
+ int r = policy_invalidate_mapping(cache->policy, cblock);
+ if (!r) {
+ r = dm_cache_remove_mapping(cache->cmd, cblock);
+ if (r) {
+ DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
+ cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
+ }
+
+ } else if (r == -ENODATA) {
+ /*
+ * Harmless, already unmapped.
+ */
+ r = 0;
- quiesce_migration(mg);
+ } else
+ DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
+
+ return r;
}
-/*----------------------------------------------------------------
- * bio processing
- *--------------------------------------------------------------*/
-static void defer_bio(struct cache *cache, struct bio *bio)
+static void invalidate_remove(struct work_struct *ws)
{
- unsigned long flags;
+ int r;
+ struct dm_cache_migration *mg = ws_to_mg(ws);
+ struct cache *cache = mg->cache;
- spin_lock_irqsave(&cache->lock, flags);
- bio_list_add(&cache->deferred_bios, bio);
- spin_unlock_irqrestore(&cache->lock, flags);
+ r = invalidate_cblock(cache, mg->invalidate_cblock);
+ if (r) {
+ invalidate_complete(mg, false);
+ return;
+ }
- wake_worker(cache);
+ init_continuation(&mg->k, invalidate_completed);
+ continue_after_commit(&cache->committer, &mg->k);
+ remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
+ mg->overwrite_bio = NULL;
+ schedule_commit(&cache->committer);
}
-static void process_flush_bio(struct cache *cache, struct bio *bio)
+static int invalidate_lock(struct dm_cache_migration *mg)
{
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+ int r;
+ struct dm_cell_key_v2 key;
+ struct cache *cache = mg->cache;
+ struct dm_bio_prison_cell_v2 *prealloc;
- BUG_ON(bio->bi_iter.bi_size);
- if (!pb->req_nr)
- remap_to_origin(cache, bio);
- else
- remap_to_cache(cache, bio, 0);
+ prealloc = alloc_prison_cell(cache);
+ if (!prealloc) {
+ invalidate_complete(mg, false);
+ return -ENOMEM;
+ }
- /*
- * REQ_PREFLUSH is not directed at any particular block so we don't
- * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH
- * by dm-core.
- */
- issue(cache, bio);
+ build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
+ r = dm_cell_lock_v2(cache->prison, &key,
+ READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
+ if (r < 0) {
+ free_prison_cell(cache, prealloc);
+ invalidate_complete(mg, false);
+ return r;
+ }
+
+ if (mg->cell != prealloc)
+ free_prison_cell(cache, prealloc);
+
+ if (r)
+ quiesce(mg, invalidate_remove);
+
+ else {
+ /*
+ * We can't call invalidate_remove() directly here because we
+ * might still be in request context.
+ */
+ init_continuation(&mg->k, invalidate_remove);
+ queue_work(cache->wq, &mg->k.ws);
+ }
+
+ return 0;
}
-static void process_discard_bio(struct cache *cache, struct prealloc *structs,
- struct bio *bio)
+static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
+ dm_oblock_t oblock, struct bio *bio)
{
- int r;
- dm_dblock_t b, e;
- struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
+ struct dm_cache_migration *mg;
- calc_discard_block_range(cache, bio, &b, &e);
- if (b == e) {
- bio_endio(bio);
- return;
+ if (!background_work_begin(cache))
+ return -EPERM;
+
+ mg = alloc_migration(cache);
+ if (!mg) {
+ background_work_end(cache);
+ return -ENOMEM;
}
- cell_prealloc = prealloc_get_cell(structs);
- r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
- (cell_free_fn) prealloc_put_cell,
- structs, &new_ocell);
- if (r > 0)
- return;
+ memset(mg, 0, sizeof(*mg));
+
+ mg->cache = cache;
+ mg->overwrite_bio = bio;
+ mg->invalidate_cblock = cblock;
+ mg->invalidate_oblock = oblock;
- discard(cache, structs, new_ocell);
+ return invalidate_lock(mg);
}
-static bool spare_migration_bandwidth(struct cache *cache)
+/*----------------------------------------------------------------
+ * bio processing
+ *--------------------------------------------------------------*/
+
+enum busy {
+ IDLE,
+ BUSY
+};
+
+static enum busy spare_migration_bandwidth(struct cache *cache)
{
+ bool idle = iot_idle_for(&cache->tracker, HZ);
sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
cache->sectors_per_block;
- return current_volume < cache->migration_threshold;
+
+ if (idle && current_volume <= cache->migration_threshold)
+ return IDLE;
+ else
+ return BUSY;
}
static void inc_hit_counter(struct cache *cache, struct bio *bio)
@@ -1666,255 +1748,143 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
/*----------------------------------------------------------------*/
-struct inc_detail {
- struct cache *cache;
- struct bio_list bios_for_issue;
- struct bio_list unhandled_bios;
- bool any_writes;
-};
-
-static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
+static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
{
- struct bio *bio;
- struct inc_detail *detail = context;
- struct cache *cache = detail->cache;
-
- inc_ds(cache, cell->holder, cell);
- if (bio_data_dir(cell->holder) == WRITE)
- detail->any_writes = true;
-
- while ((bio = bio_list_pop(&cell->bios))) {
- if (discard_or_flush(bio)) {
- bio_list_add(&detail->unhandled_bios, bio);
- continue;
- }
-
- if (bio_data_dir(bio) == WRITE)
- detail->any_writes = true;
-
- bio_list_add(&detail->bios_for_issue, bio);
- inc_ds(cache, bio, cell);
- }
+ return (bio_data_dir(bio) == WRITE) &&
+ (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
}
-// FIXME: refactor these two
-static void remap_cell_to_origin_clear_discard(struct cache *cache,
- struct dm_bio_prison_cell *cell,
- dm_oblock_t oblock, bool issue_holder)
+static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
{
- struct bio *bio;
- unsigned long flags;
- struct inc_detail detail;
-
- detail.cache = cache;
- bio_list_init(&detail.bios_for_issue);
- bio_list_init(&detail.unhandled_bios);
- detail.any_writes = false;
-
- spin_lock_irqsave(&cache->lock, flags);
- dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
- bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- remap_to_origin(cache, cell->holder);
- if (issue_holder)
- issue(cache, cell->holder);
- else
- accounted_begin(cache, cell->holder);
-
- if (detail.any_writes)
- clear_discard(cache, oblock_to_dblock(cache, oblock));
-
- while ((bio = bio_list_pop(&detail.bios_for_issue))) {
- remap_to_origin(cache, bio);
- issue(cache, bio);
- }
-
- free_prison_cell(cache, cell);
+ return writeback_mode(&cache->features) &&
+ (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
}
-static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
- dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
+static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
+ bool *commit_needed)
{
- struct bio *bio;
- unsigned long flags;
- struct inc_detail detail;
-
- detail.cache = cache;
- bio_list_init(&detail.bios_for_issue);
- bio_list_init(&detail.unhandled_bios);
- detail.any_writes = false;
-
- spin_lock_irqsave(&cache->lock, flags);
- dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
- bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- remap_to_cache(cache, cell->holder, cblock);
- if (issue_holder)
- issue(cache, cell->holder);
- else
- accounted_begin(cache, cell->holder);
+ int r, data_dir;
+ bool rb, background_queued;
+ dm_cblock_t cblock;
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
- if (detail.any_writes) {
- set_dirty(cache, oblock, cblock);
- clear_discard(cache, oblock_to_dblock(cache, oblock));
- }
+ *commit_needed = false;
- while ((bio = bio_list_pop(&detail.bios_for_issue))) {
- remap_to_cache(cache, bio, cblock);
- issue(cache, bio);
+ rb = bio_detain_shared(cache, block, bio);
+ if (!rb) {
+ /*
+ * An exclusive lock is held for this block, so we have to
+ * wait. We set the commit_needed flag so the current
+ * transaction will be committed asap, allowing this lock
+ * to be dropped.
+ */
+ *commit_needed = true;
+ return DM_MAPIO_SUBMITTED;
}
- free_prison_cell(cache, cell);
-}
+ data_dir = bio_data_dir(bio);
-/*----------------------------------------------------------------*/
+ if (optimisable_bio(cache, bio, block)) {
+ struct policy_work *op = NULL;
-struct old_oblock_lock {
- struct policy_locker locker;
- struct cache *cache;
- struct prealloc *structs;
- struct dm_bio_prison_cell *cell;
-};
-
-static int null_locker(struct policy_locker *locker, dm_oblock_t b)
-{
- /* This should never be called */
- BUG();
- return 0;
-}
+ r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
+ if (unlikely(r && r != -ENOENT)) {
+ DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
+ cache_device_name(cache), r);
+ bio_io_error(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
-static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
-{
- struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
- struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
+ if (r == -ENOENT && op) {
+ bio_drop_shared_lock(cache, bio);
+ BUG_ON(op->op != POLICY_PROMOTE);
+ mg_start(cache, op, bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+ } else {
+ r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
+ if (unlikely(r && r != -ENOENT)) {
+ DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
+ cache_device_name(cache), r);
+ bio_io_error(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
- return bio_detain(l->cache, b, NULL, cell_prealloc,
- (cell_free_fn) prealloc_put_cell,
- l->structs, &l->cell);
-}
+ if (background_queued)
+ wake_migration_worker(cache);
+ }
-static void process_cell(struct cache *cache, struct prealloc *structs,
- struct dm_bio_prison_cell *new_ocell)
-{
- int r;
- bool release_cell = true;
- struct bio *bio = new_ocell->holder;
- dm_oblock_t block = get_bio_block(cache, bio);
- struct policy_result lookup_result;
- bool passthrough = passthrough_mode(&cache->features);
- bool fast_promotion, can_migrate;
- struct old_oblock_lock ool;
-
- fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
- can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
-
- ool.locker.fn = cell_locker;
- ool.cache = cache;
- ool.structs = structs;
- ool.cell = NULL;
- r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
- bio, &ool.locker, &lookup_result);
-
- if (r == -EWOULDBLOCK)
- /* migration has been denied */
- lookup_result.op = POLICY_MISS;
-
- switch (lookup_result.op) {
- case POLICY_HIT:
- if (passthrough) {
- inc_miss_counter(cache, bio);
+ if (r == -ENOENT) {
+ /*
+ * Miss.
+ */
+ inc_miss_counter(cache, bio);
+ if (pb->req_nr == 0) {
+ accounted_begin(cache, bio);
+ remap_to_origin_clear_discard(cache, bio, block);
+ } else {
/*
- * Passthrough always maps to the origin,
- * invalidating any cache blocks that are written
- * to.
+ * This is a duplicate writethrough io that is no
+ * longer needed because the block has been demoted.
*/
+ bio_endio(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+ } else {
+ /*
+ * Hit.
+ */
+ inc_hit_counter(cache, bio);
+ /*
+ * Passthrough always maps to the origin, invalidating any
+ * cache blocks that are written to.
+ */
+ if (passthrough_mode(&cache->features)) {
if (bio_data_dir(bio) == WRITE) {
+ bio_drop_shared_lock(cache, bio);
atomic_inc(&cache->stats.demotion);
- invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
- release_cell = false;
-
- } else {
- /* FIXME: factor out issue_origin() */
+ invalidate_start(cache, cblock, block, bio);
+ } else
remap_to_origin_clear_discard(cache, bio, block);
- inc_and_issue(cache, bio, new_ocell);
- }
+
} else {
- inc_hit_counter(cache, bio);
-
- if (bio_data_dir(bio) == WRITE &&
- writethrough_mode(&cache->features) &&
- !is_dirty(cache, lookup_result.cblock)) {
- remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
- inc_and_issue(cache, bio, new_ocell);
-
- } else {
- remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
- release_cell = false;
- }
+ if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
+ !is_dirty(cache, cblock)) {
+ remap_to_origin_then_cache(cache, bio, block, cblock);
+ accounted_begin(cache, bio);
+ } else
+ remap_to_cache_dirty(cache, bio, block, cblock);
}
-
- break;
-
- case POLICY_MISS:
- inc_miss_counter(cache, bio);
- remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
- release_cell = false;
- break;
-
- case POLICY_NEW:
- atomic_inc(&cache->stats.promotion);
- promote(cache, structs, block, lookup_result.cblock, new_ocell);
- release_cell = false;
- break;
-
- case POLICY_REPLACE:
- atomic_inc(&cache->stats.demotion);
- atomic_inc(&cache->stats.promotion);
- demote_then_promote(cache, structs, lookup_result.old_oblock,
- block, lookup_result.cblock,
- ool.cell, new_ocell);
- release_cell = false;
- break;
-
- default:
- DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
- cache_device_name(cache), __func__,
- (unsigned) lookup_result.op);
- bio_io_error(bio);
}
- if (release_cell)
- cell_defer(cache, new_ocell, false);
-}
-
-static void process_bio(struct cache *cache, struct prealloc *structs,
- struct bio *bio)
-{
- int r;
- dm_oblock_t block = get_bio_block(cache, bio);
- struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
-
/*
- * Check to see if that block is currently migrating.
+ * dm core turns FUA requests into a separate payload and FLUSH req.
*/
- cell_prealloc = prealloc_get_cell(structs);
- r = bio_detain(cache, block, bio, cell_prealloc,
- (cell_free_fn) prealloc_put_cell,
- structs, &new_ocell);
- if (r > 0)
- return;
+ if (bio->bi_opf & REQ_FUA) {
+ /*
+ * issue_after_commit will call accounted_begin a second time. So
+ * we call accounted_complete() to avoid double accounting.
+ */
+ accounted_complete(cache, bio);
+ issue_after_commit(&cache->committer, bio);
+ *commit_needed = true;
+ return DM_MAPIO_SUBMITTED;
+ }
- process_cell(cache, structs, new_ocell);
+ return DM_MAPIO_REMAPPED;
}
-static int need_commit_due_to_time(struct cache *cache)
+static bool process_bio(struct cache *cache, struct bio *bio)
{
- return jiffies < cache->last_commit_jiffies ||
- jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+ bool commit_needed;
+
+ if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
+ generic_make_request(bio);
+
+ return commit_needed;
}
/*
@@ -1935,123 +1905,88 @@ static int commit(struct cache *cache, bool clean_shutdown)
return r;
}
-static int commit_if_needed(struct cache *cache)
+/*
+ * Used by the batcher.
+ */
+static blk_status_t commit_op(void *context)
{
- int r = 0;
+ struct cache *cache = context;
- if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
- dm_cache_changed_this_transaction(cache->cmd)) {
- r = commit(cache, false);
- cache->commit_requested = false;
- cache->last_commit_jiffies = jiffies;
- }
+ if (dm_cache_changed_this_transaction(cache->cmd))
+ return errno_to_blk_status(commit(cache, false));
- return r;
+ return 0;
}
-static void process_deferred_bios(struct cache *cache)
-{
- bool prealloc_used = false;
- unsigned long flags;
- struct bio_list bios;
- struct bio *bio;
- struct prealloc structs;
-
- memset(&structs, 0, sizeof(structs));
- bio_list_init(&bios);
-
- spin_lock_irqsave(&cache->lock, flags);
- bio_list_merge(&bios, &cache->deferred_bios);
- bio_list_init(&cache->deferred_bios);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- while (!bio_list_empty(&bios)) {
- /*
- * If we've got no free migration structs, and processing
- * this bio might require one, we pause until there are some
- * prepared mappings to process.
- */
- prealloc_used = true;
- if (prealloc_data_structs(cache, &structs)) {
- spin_lock_irqsave(&cache->lock, flags);
- bio_list_merge(&cache->deferred_bios, &bios);
- spin_unlock_irqrestore(&cache->lock, flags);
- break;
- }
+/*----------------------------------------------------------------*/
- bio = bio_list_pop(&bios);
+static bool process_flush_bio(struct cache *cache, struct bio *bio)
+{
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
- if (bio->bi_opf & REQ_PREFLUSH)
- process_flush_bio(cache, bio);
- else if (bio_op(bio) == REQ_OP_DISCARD)
- process_discard_bio(cache, &structs, bio);
- else
- process_bio(cache, &structs, bio);
- }
+ if (!pb->req_nr)
+ remap_to_origin(cache, bio);
+ else
+ remap_to_cache(cache, bio, 0);
- if (prealloc_used)
- prealloc_free_structs(cache, &structs);
+ issue_after_commit(&cache->committer, bio);
+ return true;
}
-static void process_deferred_cells(struct cache *cache)
+static bool process_discard_bio(struct cache *cache, struct bio *bio)
{
- bool prealloc_used = false;
- unsigned long flags;
- struct dm_bio_prison_cell *cell, *tmp;
- struct list_head cells;
- struct prealloc structs;
-
- memset(&structs, 0, sizeof(structs));
-
- INIT_LIST_HEAD(&cells);
-
- spin_lock_irqsave(&cache->lock, flags);
- list_splice_init(&cache->deferred_cells, &cells);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- list_for_each_entry_safe(cell, tmp, &cells, user_list) {
- /*
- * If we've got no free migration structs, and processing
- * this bio might require one, we pause until there are some
- * prepared mappings to process.
- */
- prealloc_used = true;
- if (prealloc_data_structs(cache, &structs)) {
- spin_lock_irqsave(&cache->lock, flags);
- list_splice(&cells, &cache->deferred_cells);
- spin_unlock_irqrestore(&cache->lock, flags);
- break;
- }
+ dm_dblock_t b, e;
- process_cell(cache, &structs, cell);
+ // FIXME: do we need to lock the region? Or can we just assume the
+ // user wont be so foolish as to issue discard concurrently with
+ // other IO?
+ calc_discard_block_range(cache, bio, &b, &e);
+ while (b != e) {
+ set_discard(cache, b);
+ b = to_dblock(from_dblock(b) + 1);
}
- if (prealloc_used)
- prealloc_free_structs(cache, &structs);
+ bio_endio(bio);
+
+ return false;
}
-static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
+static void process_deferred_bios(struct work_struct *ws)
{
+ struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
+
unsigned long flags;
+ bool commit_needed = false;
struct bio_list bios;
struct bio *bio;
bio_list_init(&bios);
spin_lock_irqsave(&cache->lock, flags);
- bio_list_merge(&bios, &cache->deferred_flush_bios);
- bio_list_init(&cache->deferred_flush_bios);
+ bio_list_merge(&bios, &cache->deferred_bios);
+ bio_list_init(&cache->deferred_bios);
spin_unlock_irqrestore(&cache->lock, flags);
- /*
- * These bios have already been through inc_ds()
- */
- while ((bio = bio_list_pop(&bios)))
- submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
+ while ((bio = bio_list_pop(&bios))) {
+ if (bio->bi_opf & REQ_PREFLUSH)
+ commit_needed = process_flush_bio(cache, bio) || commit_needed;
+
+ else if (bio_op(bio) == REQ_OP_DISCARD)
+ commit_needed = process_discard_bio(cache, bio) || commit_needed;
+
+ else
+ commit_needed = process_bio(cache, bio) || commit_needed;
+ }
+
+ if (commit_needed)
+ schedule_commit(&cache->committer);
}
-static void process_deferred_writethrough_bios(struct cache *cache)
+static void process_deferred_writethrough_bios(struct work_struct *ws)
{
+ struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker);
+
unsigned long flags;
struct bio_list bios;
struct bio *bio;
@@ -2064,153 +1999,15 @@ static void process_deferred_writethrough_bios(struct cache *cache)
spin_unlock_irqrestore(&cache->lock, flags);
/*
- * These bios have already been through inc_ds()
+ * These bios have already been through accounted_begin()
*/
while ((bio = bio_list_pop(&bios)))
- accounted_request(cache, bio);
-}
-
-static void writeback_some_dirty_blocks(struct cache *cache)
-{
- bool prealloc_used = false;
- dm_oblock_t oblock;
- dm_cblock_t cblock;
- struct prealloc structs;
- struct dm_bio_prison_cell *old_ocell;
- bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
-
- memset(&structs, 0, sizeof(structs));
-
- while (spare_migration_bandwidth(cache)) {
- if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
- break; /* no work to do */
-
- prealloc_used = true;
- if (prealloc_data_structs(cache, &structs) ||
- get_cell(cache, oblock, &structs, &old_ocell)) {
- policy_set_dirty(cache->policy, oblock);
- break;
- }
-
- writeback(cache, &structs, oblock, cblock, old_ocell);
- }
-
- if (prealloc_used)
- prealloc_free_structs(cache, &structs);
-}
-
-/*----------------------------------------------------------------
- * Invalidations.
- * Dropping something from the cache *without* writing back.
- *--------------------------------------------------------------*/
-
-static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
-{
- int r = 0;
- uint64_t begin = from_cblock(req->cblocks->begin);
- uint64_t end = from_cblock(req->cblocks->end);
-
- while (begin != end) {
- r = policy_remove_cblock(cache->policy, to_cblock(begin));
- if (!r) {
- r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
- if (r) {
- metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
- break;
- }
-
- } else if (r == -ENODATA) {
- /* harmless, already unmapped */
- r = 0;
-
- } else {
- DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
- break;
- }
-
- begin++;
- }
-
- cache->commit_requested = true;
-
- req->err = r;
- atomic_set(&req->complete, 1);
-
- wake_up(&req->result_wait);
-}
-
-static void process_invalidation_requests(struct cache *cache)
-{
- struct list_head list;
- struct invalidation_request *req, *tmp;
-
- INIT_LIST_HEAD(&list);
- spin_lock(&cache->invalidation_lock);
- list_splice_init(&cache->invalidation_requests, &list);
- spin_unlock(&cache->invalidation_lock);
-
- list_for_each_entry_safe (req, tmp, &list, list)
- process_invalidation_request(cache, req);
+ generic_make_request(bio);
}
/*----------------------------------------------------------------
* Main worker loop
*--------------------------------------------------------------*/
-static bool is_quiescing(struct cache *cache)
-{
- return atomic_read(&cache->quiescing);
-}
-
-static void ack_quiescing(struct cache *cache)
-{
- if (is_quiescing(cache)) {
- atomic_inc(&cache->quiescing_ack);
- wake_up(&cache->quiescing_wait);
- }
-}
-
-static void wait_for_quiescing_ack(struct cache *cache)
-{
- wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
-}
-
-static void start_quiescing(struct cache *cache)
-{
- atomic_inc(&cache->quiescing);
- wait_for_quiescing_ack(cache);
-}
-
-static void stop_quiescing(struct cache *cache)
-{
- atomic_set(&cache->quiescing, 0);
- atomic_set(&cache->quiescing_ack, 0);
-}
-
-static void wait_for_migrations(struct cache *cache)
-{
- wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
-}
-
-static void stop_worker(struct cache *cache)
-{
- cancel_delayed_work(&cache->waker);
- flush_workqueue(cache->wq);
-}
-
-static void requeue_deferred_cells(struct cache *cache)
-{
- unsigned long flags;
- struct list_head cells;
- struct dm_bio_prison_cell *cell, *tmp;
-
- INIT_LIST_HEAD(&cells);
- spin_lock_irqsave(&cache->lock, flags);
- list_splice_init(&cache->deferred_cells, &cells);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- list_for_each_entry_safe(cell, tmp, &cells, user_list)
- cell_requeue(cache, cell);
-}
static void requeue_deferred_bios(struct cache *cache)
{
@@ -2222,58 +2019,11 @@ static void requeue_deferred_bios(struct cache *cache)
bio_list_init(&cache->deferred_bios);
while ((bio = bio_list_pop(&bios))) {
- bio->bi_error = DM_ENDIO_REQUEUE;
+ bio->bi_status = BLK_STS_DM_REQUEUE;
bio_endio(bio);
}
}
-static int more_work(struct cache *cache)
-{
- if (is_quiescing(cache))
- return !list_empty(&cache->quiesced_migrations) ||
- !list_empty(&cache->completed_migrations) ||
- !list_empty(&cache->need_commit_migrations);
- else
- return !bio_list_empty(&cache->deferred_bios) ||
- !list_empty(&cache->deferred_cells) ||
- !bio_list_empty(&cache->deferred_flush_bios) ||
- !bio_list_empty(&cache->deferred_writethrough_bios) ||
- !list_empty(&cache->quiesced_migrations) ||
- !list_empty(&cache->completed_migrations) ||
- !list_empty(&cache->need_commit_migrations) ||
- cache->invalidate;
-}
-
-static void do_worker(struct work_struct *ws)
-{
- struct cache *cache = container_of(ws, struct cache, worker);
-
- do {
- if (!is_quiescing(cache)) {
- writeback_some_dirty_blocks(cache);
- process_deferred_writethrough_bios(cache);
- process_deferred_bios(cache);
- process_deferred_cells(cache);
- process_invalidation_requests(cache);
- }
-
- process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
- process_migrations(cache, &cache->completed_migrations, complete_migration);
-
- if (commit_if_needed(cache)) {
- process_deferred_flush_bios(cache, false);
- process_migrations(cache, &cache->need_commit_migrations, migration_failure);
- } else {
- process_deferred_flush_bios(cache, true);
- process_migrations(cache, &cache->need_commit_migrations,
- migration_success_post_commit);
- }
-
- ack_quiescing(cache);
-
- } while (more_work(cache));
-}
-
/*
* We want to commit periodically so that not too much
* unwritten metadata builds up.
@@ -2281,25 +2031,37 @@ static void do_worker(struct work_struct *ws)
static void do_waker(struct work_struct *ws)
{
struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
+
policy_tick(cache->policy, true);
- wake_worker(cache);
+ wake_migration_worker(cache);
+ schedule_commit(&cache->committer);
queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
}
-/*----------------------------------------------------------------*/
-
-static int is_congested(struct dm_dev *dev, int bdi_bits)
+static void check_migrations(struct work_struct *ws)
{
- struct request_queue *q = bdev_get_queue(dev->bdev);
- return bdi_congested(&q->backing_dev_info, bdi_bits);
-}
+ int r;
+ struct policy_work *op;
+ struct cache *cache = container_of(ws, struct cache, migration_worker);
+ enum busy b;
-static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
-{
- struct cache *cache = container_of(cb, struct cache, callbacks);
+ for (;;) {
+ b = spare_migration_bandwidth(cache);
- return is_congested(cache->origin_dev, bdi_bits) ||
- is_congested(cache->cache_dev, bdi_bits);
+ r = policy_get_background_work(cache->policy, b == IDLE, &op);
+ if (r == -ENODATA)
+ break;
+
+ if (r) {
+ DMERR_LIMIT("%s: policy_background_work failed",
+ cache_device_name(cache));
+ break;
+ }
+
+ r = mg_start(cache, op, NULL);
+ if (r)
+ break;
+ }
}
/*----------------------------------------------------------------
@@ -2316,11 +2078,8 @@ static void destroy(struct cache *cache)
mempool_destroy(cache->migration_pool);
- if (cache->all_io_ds)
- dm_deferred_set_destroy(cache->all_io_ds);
-
if (cache->prison)
- dm_bio_prison_destroy(cache->prison);
+ dm_bio_prison_destroy_v2(cache->prison);
if (cache->wq)
destroy_workqueue(cache->wq);
@@ -2541,13 +2300,14 @@ static void init_features(struct cache_features *cf)
{
cf->mode = CM_WRITE;
cf->io_mode = CM_IO_WRITEBACK;
+ cf->metadata_version = 1;
}
static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
char **error)
{
static struct dm_arg _args[] = {
- {0, 1, "Invalid number of cache feature arguments"},
+ {0, 2, "Invalid number of cache feature arguments"},
};
int r;
@@ -2573,6 +2333,9 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
else if (!strcasecmp(arg, "passthrough"))
cf->io_mode = CM_IO_PASSTHROUGH;
+ else if (!strcasecmp(arg, "metadata2"))
+ cf->metadata_version = 2;
+
else {
*error = "Unrecognised cache feature requested";
return -EINVAL;
@@ -2709,6 +2472,7 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
return PTR_ERR(p);
}
cache->policy = p;
+ BUG_ON(!cache->policy);
return 0;
}
@@ -2752,6 +2516,20 @@ static void set_cache_size(struct cache *cache, dm_cblock_t size)
cache->cache_size = size;
}
+static int is_congested(struct dm_dev *dev, int bdi_bits)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+ return bdi_congested(q->backing_dev_info, bdi_bits);
+}
+
+static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
+{
+ struct cache *cache = container_of(cb, struct cache, callbacks);
+
+ return is_congested(cache->origin_dev, bdi_bits) ||
+ is_congested(cache->cache_dev, bdi_bits);
+}
+
#define DEFAULT_MIGRATION_THRESHOLD 2048
static int cache_create(struct cache_args *ca, struct cache **result)
@@ -2775,7 +2553,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
ti->num_discard_bios = 1;
ti->discards_supported = true;
- ti->discard_zeroes_data_unsupported = true;
ti->split_discard_bios = false;
cache->features = ca->features;
@@ -2790,7 +2567,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
- /* FIXME: factor out this whole section */
origin_blocks = cache->origin_sectors = ca->origin_sectors;
origin_blocks = block_div(origin_blocks, ca->block_size);
cache->origin_blocks = to_oblock(origin_blocks);
@@ -2827,7 +2603,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
ca->block_size, may_format,
- dm_cache_policy_get_hint_size(cache->policy));
+ dm_cache_policy_get_hint_size(cache->policy),
+ ca->features.metadata_version);
if (IS_ERR(cmd)) {
*error = "Error creating metadata object";
r = PTR_ERR(cmd);
@@ -2855,24 +2632,18 @@ static int cache_create(struct cache_args *ca, struct cache **result)
r = -EINVAL;
goto bad;
}
+
+ policy_allow_migrations(cache->policy, false);
}
spin_lock_init(&cache->lock);
INIT_LIST_HEAD(&cache->deferred_cells);
bio_list_init(&cache->deferred_bios);
- bio_list_init(&cache->deferred_flush_bios);
bio_list_init(&cache->deferred_writethrough_bios);
- INIT_LIST_HEAD(&cache->quiesced_migrations);
- INIT_LIST_HEAD(&cache->completed_migrations);
- INIT_LIST_HEAD(&cache->need_commit_migrations);
atomic_set(&cache->nr_allocated_migrations, 0);
atomic_set(&cache->nr_io_migrations, 0);
init_waitqueue_head(&cache->migration_wait);
- init_waitqueue_head(&cache->quiescing_wait);
- atomic_set(&cache->quiescing, 0);
- atomic_set(&cache->quiescing_ack, 0);
-
r = -ENOMEM;
atomic_set(&cache->nr_dirty, 0);
cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
@@ -2901,27 +2672,23 @@ static int cache_create(struct cache_args *ca, struct cache **result)
goto bad;
}
- cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
+ cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
if (!cache->wq) {
*error = "could not create workqueue for metadata object";
goto bad;
}
- INIT_WORK(&cache->worker, do_worker);
+ INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
+ INIT_WORK(&cache->deferred_writethrough_worker,
+ process_deferred_writethrough_bios);
+ INIT_WORK(&cache->migration_worker, check_migrations);
INIT_DELAYED_WORK(&cache->waker, do_waker);
- cache->last_commit_jiffies = jiffies;
- cache->prison = dm_bio_prison_create();
+ cache->prison = dm_bio_prison_create_v2(cache->wq);
if (!cache->prison) {
*error = "could not create bio prison";
goto bad;
}
- cache->all_io_ds = dm_deferred_set_create();
- if (!cache->all_io_ds) {
- *error = "could not create all_io deferred set";
- goto bad;
- }
-
cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
migration_cache);
if (!cache->migration_pool) {
@@ -2948,11 +2715,15 @@ static int cache_create(struct cache_args *ca, struct cache **result)
spin_lock_init(&cache->invalidation_lock);
INIT_LIST_HEAD(&cache->invalidation_requests);
- iot_init(&cache->origin_tracker);
+ batcher_init(&cache->committer, commit_op, cache,
+ issue_op, cache, cache->wq);
+ iot_init(&cache->tracker);
+
+ init_rwsem(&cache->background_work_lock);
+ prevent_background_work(cache);
*result = cache;
return 0;
-
bad:
destroy(cache);
return r;
@@ -3010,7 +2781,6 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
ti->private = cache;
-
out:
destroy_cache_args(ca);
return r;
@@ -3023,17 +2793,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
struct cache *cache = ti->private;
int r;
- struct dm_bio_prison_cell *cell = NULL;
+ bool commit_needed;
dm_oblock_t block = get_bio_block(cache, bio);
size_t pb_data_size = get_per_bio_data_size(cache);
- bool can_migrate = false;
- bool fast_promotion;
- struct policy_result lookup_result;
- struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
- struct old_oblock_lock ool;
-
- ool.locker.fn = null_locker;
+ init_per_bio_data(bio, pb_data_size);
if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
/*
* This can only occur if the io goes to a partial block at
@@ -3050,106 +2814,15 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
- /*
- * Check to see if that block is currently migrating.
- */
- cell = alloc_prison_cell(cache);
- if (!cell) {
- defer_bio(cache, bio);
- return DM_MAPIO_SUBMITTED;
- }
-
- r = bio_detain(cache, block, bio, cell,
- (cell_free_fn) free_prison_cell,
- cache, &cell);
- if (r) {
- if (r < 0)
- defer_bio(cache, bio);
-
- return DM_MAPIO_SUBMITTED;
- }
-
- fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
-
- r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
- bio, &ool.locker, &lookup_result);
- if (r == -EWOULDBLOCK) {
- cell_defer(cache, cell, true);
- return DM_MAPIO_SUBMITTED;
-
- } else if (r) {
- DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
- cache_device_name(cache), r);
- cell_defer(cache, cell, false);
- bio_io_error(bio);
- return DM_MAPIO_SUBMITTED;
- }
-
- r = DM_MAPIO_REMAPPED;
- switch (lookup_result.op) {
- case POLICY_HIT:
- if (passthrough_mode(&cache->features)) {
- if (bio_data_dir(bio) == WRITE) {
- /*
- * We need to invalidate this block, so
- * defer for the worker thread.
- */
- cell_defer(cache, cell, true);
- r = DM_MAPIO_SUBMITTED;
-
- } else {
- inc_miss_counter(cache, bio);
- remap_to_origin_clear_discard(cache, bio, block);
- accounted_begin(cache, bio);
- inc_ds(cache, bio, cell);
- // FIXME: we want to remap hits or misses straight
- // away rather than passing over to the worker.
- cell_defer(cache, cell, false);
- }
-
- } else {
- inc_hit_counter(cache, bio);
- if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
- !is_dirty(cache, lookup_result.cblock)) {
- remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
- accounted_begin(cache, bio);
- inc_ds(cache, bio, cell);
- cell_defer(cache, cell, false);
-
- } else
- remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
- }
- break;
-
- case POLICY_MISS:
- inc_miss_counter(cache, bio);
- if (pb->req_nr != 0) {
- /*
- * This is a duplicate writethrough io that is no
- * longer needed because the block has been demoted.
- */
- bio_endio(bio);
- // FIXME: remap everything as a miss
- cell_defer(cache, cell, false);
- r = DM_MAPIO_SUBMITTED;
-
- } else
- remap_cell_to_origin_clear_discard(cache, cell, block, false);
- break;
-
- default:
- DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
- cache_device_name(cache), __func__,
- (unsigned) lookup_result.op);
- cell_defer(cache, cell, false);
- bio_io_error(bio);
- r = DM_MAPIO_SUBMITTED;
- }
+ r = map_bio(cache, bio, block, &commit_needed);
+ if (commit_needed)
+ schedule_commit(&cache->committer);
return r;
}
-static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
+static int cache_end_io(struct dm_target *ti, struct bio *bio,
+ blk_status_t *error)
{
struct cache *cache = ti->private;
unsigned long flags;
@@ -3164,29 +2837,24 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
spin_unlock_irqrestore(&cache->lock, flags);
}
- check_for_quiesced_migrations(cache, pb);
+ bio_drop_shared_lock(cache, bio);
accounted_complete(cache, bio);
- return 0;
+ return DM_ENDIO_DONE;
}
static int write_dirty_bitset(struct cache *cache)
{
- unsigned i, r;
+ int r;
if (get_cache_mode(cache) >= CM_READ_ONLY)
return -EINVAL;
- for (i = 0; i < from_cblock(cache->cache_size); i++) {
- r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
- is_dirty(cache, to_cblock(i)));
- if (r) {
- metadata_operation_failed(cache, "dm_cache_set_dirty", r);
- return r;
- }
- }
+ r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
+ if (r)
+ metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
- return 0;
+ return r;
}
static int write_discard_bitset(struct cache *cache)
@@ -3269,12 +2937,18 @@ static void cache_postsuspend(struct dm_target *ti)
{
struct cache *cache = ti->private;
- start_quiescing(cache);
- wait_for_migrations(cache);
- stop_worker(cache);
+ prevent_background_work(cache);
+ BUG_ON(atomic_read(&cache->nr_io_migrations));
+
+ cancel_delayed_work(&cache->waker);
+ flush_workqueue(cache->wq);
+ WARN_ON(cache->tracker.in_flight);
+
+ /*
+ * If it's a flush suspend there won't be any deferred bios, so this
+ * call is harmless.
+ */
requeue_deferred_bios(cache);
- requeue_deferred_cells(cache);
- stop_quiescing(cache);
if (get_cache_mode(cache) == CM_WRITE)
(void) sync_metadata(cache);
@@ -3286,15 +2960,16 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
int r;
struct cache *cache = context;
- r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
+ if (dirty) {
+ set_bit(from_cblock(cblock), cache->dirty_bitset);
+ atomic_inc(&cache->nr_dirty);
+ } else
+ clear_bit(from_cblock(cblock), cache->dirty_bitset);
+
+ r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
if (r)
return r;
- if (dirty)
- set_dirty(cache, oblock, cblock);
- else
- clear_dirty(cache, oblock, cblock);
-
return 0;
}
@@ -3493,6 +3168,7 @@ static void cache_resume(struct dm_target *ti)
struct cache *cache = ti->private;
cache->need_tick_bio = true;
+ allow_background_work(cache);
do_waker(&cache->waker.work);
}
@@ -3547,11 +3223,11 @@ static void cache_status(struct dm_target *ti, status_type_t type,
residency = policy_residency(cache->policy);
- DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
+ DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
(unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
(unsigned long long)nr_blocks_metadata,
- cache->sectors_per_block,
+ (unsigned long long)cache->sectors_per_block,
(unsigned long long) from_cblock(residency),
(unsigned long long) from_cblock(cache->cache_size),
(unsigned) atomic_read(&cache->stats.read_hit),
@@ -3562,14 +3238,19 @@ static void cache_status(struct dm_target *ti, status_type_t type,
(unsigned) atomic_read(&cache->stats.promotion),
(unsigned long) atomic_read(&cache->nr_dirty));
+ if (cache->features.metadata_version == 2)
+ DMEMIT("2 metadata2 ");
+ else
+ DMEMIT("1 ");
+
if (writethrough_mode(&cache->features))
- DMEMIT("1 writethrough ");
+ DMEMIT("writethrough ");
else if (passthrough_mode(&cache->features))
- DMEMIT("1 passthrough ");
+ DMEMIT("passthrough ");
else if (writeback_mode(&cache->features))
- DMEMIT("1 writeback ");
+ DMEMIT("writeback ");
else {
DMERR("%s: internal error: unknown io mode: %d",
@@ -3622,10 +3303,19 @@ err:
}
/*
+ * Defines a range of cblocks, begin to (end - 1) are in the range. end is
+ * the one-past-the-end value.
+ */
+struct cblock_range {
+ dm_cblock_t begin;
+ dm_cblock_t end;
+};
+
+/*
* A cache block range can take two forms:
*
* i) A single cblock, eg. '3456'
- * ii) A begin and end cblock with dots between, eg. 123-234
+ * ii) A begin and end cblock with a dash between, eg. 123-234
*/
static int parse_cblock_range(struct cache *cache, const char *str,
struct cblock_range *result)
@@ -3691,23 +3381,31 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range
return 0;
}
+static inline dm_cblock_t cblock_succ(dm_cblock_t b)
+{
+ return to_cblock(from_cblock(b) + 1);
+}
+
static int request_invalidation(struct cache *cache, struct cblock_range *range)
{
- struct invalidation_request req;
+ int r = 0;
- INIT_LIST_HEAD(&req.list);
- req.cblocks = range;
- atomic_set(&req.complete, 0);
- req.err = 0;
- init_waitqueue_head(&req.result_wait);
+ /*
+ * We don't need to do any locking here because we know we're in
+ * passthrough mode. There is potential for a race between an
+ * invalidation triggered by an io and an invalidation message. This
+ * is harmless, we needn't worry if the policy call fails.
+ */
+ while (range->begin != range->end) {
+ r = invalidate_cblock(cache, range->begin);
+ if (r)
+ return r;
- spin_lock(&cache->invalidation_lock);
- list_add(&req.list, &cache->invalidation_requests);
- spin_unlock(&cache->invalidation_lock);
- wake_worker(cache);
+ range->begin = cblock_succ(range->begin);
+ }
- wait_event(req.result_wait, atomic_read(&req.complete));
- return req.err;
+ cache->commit_requested = true;
+ return r;
}
static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
@@ -3817,7 +3515,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 9, 0},
+ .version = {2, 0, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 40ceba1fe8be..24eddbdf2ab4 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -47,7 +47,7 @@ struct mapped_device {
struct request_queue *queue;
int numa_node_id;
- unsigned type;
+ enum dm_queue_mode type;
/* Protect queue and type against concurrent access. */
struct mutex type_lock;
@@ -58,6 +58,7 @@ struct mapped_device {
struct target_type *immutable_target_type;
struct gendisk *disk;
+ struct dax_device *dax_dev;
char name[16];
void *interface_ptr;
@@ -92,7 +93,6 @@ struct mapped_device {
* io objects are allocated from here.
*/
mempool_t *io_pool;
- mempool_t *rq_pool;
struct bio_set *bs;
@@ -133,6 +133,7 @@ void dm_init_md_queue(struct mapped_device *md);
void dm_init_normal_md_queue(struct mapped_device *md);
int md_in_flight(struct mapped_device *md);
void disable_write_same(struct mapped_device *md);
+void disable_write_zeroes(struct mapped_device *md);
static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
{
@@ -146,4 +147,7 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen
return !maxlen || strlen(result) + 1 >= maxlen;
}
+extern atomic_t dm_global_event_nr;
+extern wait_queue_head_t dm_global_eventq;
+
#endif
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8a9f742d8ed7..cdf6b1e12460 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,8 +1,8 @@
/*
* Copyright (C) 2003 Jana Saout <[email protected]>
* Copyright (C) 2004 Clemens Fruhwirth <[email protected]>
- * Copyright (C) 2006-2015 Red Hat, Inc. All rights reserved.
- * Copyright (C) 2013 Milan Broz <[email protected]>
+ * Copyright (C) 2006-2017 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2013-2017 Milan Broz <[email protected]>
*
* This file is released under the GPL.
*/
@@ -31,6 +31,9 @@
#include <crypto/md5.h>
#include <crypto/algapi.h>
#include <crypto/skcipher.h>
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/rtnetlink.h> /* for struct rtattr and RTA macros only */
#include <keys/user-type.h>
#include <linux/device-mapper.h>
@@ -48,7 +51,11 @@ struct convert_context {
struct bvec_iter iter_out;
sector_t cc_sector;
atomic_t cc_pending;
- struct skcipher_request *req;
+ union {
+ struct skcipher_request *req;
+ struct aead_request *req_aead;
+ } r;
+
};
/*
@@ -57,12 +64,14 @@ struct convert_context {
struct dm_crypt_io {
struct crypt_config *cc;
struct bio *base_bio;
+ u8 *integrity_metadata;
+ bool integrity_metadata_from_pool;
struct work_struct work;
struct convert_context ctx;
atomic_t io_pending;
- int error;
+ blk_status_t error;
sector_t sector;
struct rb_node rb_node;
@@ -70,8 +79,8 @@ struct dm_crypt_io {
struct dm_crypt_request {
struct convert_context *ctx;
- struct scatterlist sg_in;
- struct scatterlist sg_out;
+ struct scatterlist sg_in[4];
+ struct scatterlist sg_out[4];
sector_t iv_sector;
};
@@ -118,6 +127,11 @@ struct iv_tcw_private {
enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
+enum cipher_flags {
+ CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cipher */
+ CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */
+};
+
/*
* The fields in here must be read only after initialization.
*/
@@ -126,11 +140,14 @@ struct crypt_config {
sector_t start;
/*
- * pool for per bio private data, crypto requests and
- * encryption requeusts/buffer pages
+ * pool for per bio private data, crypto requests,
+ * encryption requests/buffer pages and integrity tags
*/
mempool_t *req_pool;
mempool_t *page_pool;
+ mempool_t *tag_pool;
+ unsigned tag_pool_max_sectors;
+
struct bio_set *bs;
struct mutex bio_alloc_lock;
@@ -143,6 +160,7 @@ struct crypt_config {
char *cipher;
char *cipher_string;
+ char *cipher_auth;
char *key_string;
const struct crypt_iv_operations *iv_gen_ops;
@@ -154,11 +172,17 @@ struct crypt_config {
} iv_gen_private;
sector_t iv_offset;
unsigned int iv_size;
+ unsigned short int sector_size;
+ unsigned char sector_shift;
/* ESSIV: struct crypto_cipher *essiv_tfm */
void *iv_private;
- struct crypto_skcipher **tfms;
+ union {
+ struct crypto_skcipher **tfms;
+ struct crypto_aead **tfms_aead;
+ } cipher_tfm;
unsigned tfms_count;
+ unsigned long cipher_flags;
/*
* Layout of each crypto request:
@@ -181,21 +205,36 @@ struct crypt_config {
unsigned int key_size;
unsigned int key_parts; /* independent parts in key buffer */
unsigned int key_extra_size; /* additional keys length */
+ unsigned int key_mac_size; /* MAC key size for authenc(...) */
+
+ unsigned int integrity_tag_size;
+ unsigned int integrity_iv_size;
+ unsigned int on_disk_tag_size;
+
+ u8 *authenc_key; /* space for keys in authenc() format (if used) */
u8 key[0];
};
-#define MIN_IOS 64
+#define MIN_IOS 64
+#define MAX_TAG_SIZE 480
+#define POOL_ENTRY_SIZE 512
static void clone_init(struct dm_crypt_io *, struct bio *);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
-static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
+static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc,
+ struct scatterlist *sg);
/*
- * Use this to access cipher attributes that are the same for each CPU.
+ * Use this to access cipher attributes that are independent of the key.
*/
static struct crypto_skcipher *any_tfm(struct crypt_config *cc)
{
- return cc->tfms[0];
+ return cc->cipher_tfm.tfms[0];
+}
+
+static struct crypto_aead *any_tfm_aead(struct crypt_config *cc)
+{
+ return cc->cipher_tfm.tfms_aead[0];
}
/*
@@ -207,6 +246,9 @@ static struct crypto_skcipher *any_tfm(struct crypt_config *cc)
* plain64: the initial vector is the 64-bit little-endian version of the sector
* number, padded with zeros if necessary.
*
+ * plain64be: the initial vector is the 64-bit big-endian version of the sector
+ * number, padded with zeros if necessary.
+ *
* essiv: "encrypted sector|salt initial vector", the sector number is
* encrypted with the bulk cipher using a salt as key. The salt
* should be derived from the bulk cipher's key via hashing.
@@ -263,6 +305,16 @@ static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
return 0;
}
+static int crypt_iv_plain64be_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ memset(iv, 0, cc->iv_size);
+ /* iv_size is at least of size u64; usually it is 16 bytes */
+ *(__be64 *)&iv[cc->iv_size - sizeof(u64)] = cpu_to_be64(dmreq->iv_sector);
+
+ return 0;
+}
+
/* Initialise ESSIV - compute salt but no local memory allocations */
static int crypt_iv_essiv_init(struct crypt_config *cc)
{
@@ -310,10 +362,11 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
return err;
}
-/* Set up per cpu cipher state */
-static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
- struct dm_target *ti,
- u8 *salt, unsigned saltsize)
+/* Allocate the cipher for ESSIV */
+static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc,
+ struct dm_target *ti,
+ const u8 *salt,
+ unsigned int saltsize)
{
struct crypto_cipher *essiv_tfm;
int err;
@@ -325,8 +378,7 @@ static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
return essiv_tfm;
}
- if (crypto_cipher_blocksize(essiv_tfm) !=
- crypto_skcipher_ivsize(any_tfm(cc))) {
+ if (crypto_cipher_blocksize(essiv_tfm) != cc->iv_size) {
ti->error = "Block size of ESSIV cipher does "
"not match IV size of block cipher";
crypto_free_cipher(essiv_tfm);
@@ -393,8 +445,8 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
cc->iv_gen_private.essiv.salt = salt;
cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
- essiv_tfm = setup_essiv_cpu(cc, ti, salt,
- crypto_ahash_digestsize(hash_tfm));
+ essiv_tfm = alloc_essiv_cipher(cc, ti, salt,
+ crypto_ahash_digestsize(hash_tfm));
if (IS_ERR(essiv_tfm)) {
crypt_iv_essiv_dtr(cc);
return PTR_ERR(essiv_tfm);
@@ -488,6 +540,11 @@ static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
{
struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+ if (cc->sector_size != (1 << SECTOR_SHIFT)) {
+ ti->error = "Unsupported sector size for LMK";
+ return -EINVAL;
+ }
+
lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
if (IS_ERR(lmk->hash_tfm)) {
ti->error = "Error initializing LMK hash";
@@ -585,12 +642,14 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
struct dm_crypt_request *dmreq)
{
+ struct scatterlist *sg;
u8 *src;
int r = 0;
if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
- src = kmap_atomic(sg_page(&dmreq->sg_in));
- r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
+ sg = crypt_get_sg_data(cc, dmreq->sg_in);
+ src = kmap_atomic(sg_page(sg));
+ r = crypt_iv_lmk_one(cc, iv, dmreq, src + sg->offset);
kunmap_atomic(src);
} else
memset(iv, 0, cc->iv_size);
@@ -601,18 +660,20 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
struct dm_crypt_request *dmreq)
{
+ struct scatterlist *sg;
u8 *dst;
int r;
if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
return 0;
- dst = kmap_atomic(sg_page(&dmreq->sg_out));
- r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
+ sg = crypt_get_sg_data(cc, dmreq->sg_out);
+ dst = kmap_atomic(sg_page(sg));
+ r = crypt_iv_lmk_one(cc, iv, dmreq, dst + sg->offset);
/* Tweak the first block of plaintext sector */
if (!r)
- crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
+ crypto_xor(dst + sg->offset, iv, cc->iv_size);
kunmap_atomic(dst);
return r;
@@ -637,6 +698,11 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
{
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+ if (cc->sector_size != (1 << SECTOR_SHIFT)) {
+ ti->error = "Unsupported sector size for TCW";
+ return -EINVAL;
+ }
+
if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) {
ti->error = "Wrong key size for TCW";
return -EINVAL;
@@ -724,6 +790,7 @@ out:
static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
struct dm_crypt_request *dmreq)
{
+ struct scatterlist *sg;
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
__le64 sector = cpu_to_le64(dmreq->iv_sector);
u8 *src;
@@ -731,8 +798,9 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
/* Remove whitening from ciphertext */
if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
- src = kmap_atomic(sg_page(&dmreq->sg_in));
- r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset);
+ sg = crypt_get_sg_data(cc, dmreq->sg_in);
+ src = kmap_atomic(sg_page(sg));
+ r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset);
kunmap_atomic(src);
}
@@ -748,6 +816,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
struct dm_crypt_request *dmreq)
{
+ struct scatterlist *sg;
u8 *dst;
int r;
@@ -755,13 +824,22 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
return 0;
/* Apply whitening on ciphertext */
- dst = kmap_atomic(sg_page(&dmreq->sg_out));
- r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset);
+ sg = crypt_get_sg_data(cc, dmreq->sg_out);
+ dst = kmap_atomic(sg_page(sg));
+ r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset);
kunmap_atomic(dst);
return r;
}
+static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ /* Used only for writes, there must be an additional space to store IV */
+ get_random_bytes(iv, cc->iv_size);
+ return 0;
+}
+
static const struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
@@ -770,6 +848,10 @@ static const struct crypt_iv_operations crypt_iv_plain64_ops = {
.generator = crypt_iv_plain64_gen
};
+static const struct crypt_iv_operations crypt_iv_plain64be_ops = {
+ .generator = crypt_iv_plain64be_gen
+};
+
static const struct crypt_iv_operations crypt_iv_essiv_ops = {
.ctr = crypt_iv_essiv_ctr,
.dtr = crypt_iv_essiv_dtr,
@@ -806,6 +888,108 @@ static const struct crypt_iv_operations crypt_iv_tcw_ops = {
.post = crypt_iv_tcw_post
};
+static struct crypt_iv_operations crypt_iv_random_ops = {
+ .generator = crypt_iv_random_gen
+};
+
+/*
+ * Integrity extensions
+ */
+static bool crypt_integrity_aead(struct crypt_config *cc)
+{
+ return test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags);
+}
+
+static bool crypt_integrity_hmac(struct crypt_config *cc)
+{
+ return crypt_integrity_aead(cc) && cc->key_mac_size;
+}
+
+/* Get sg containing data */
+static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc,
+ struct scatterlist *sg)
+{
+ if (unlikely(crypt_integrity_aead(cc)))
+ return &sg[2];
+
+ return sg;
+}
+
+static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio)
+{
+ struct bio_integrity_payload *bip;
+ unsigned int tag_len;
+ int ret;
+
+ if (!bio_sectors(bio) || !io->cc->on_disk_tag_size)
+ return 0;
+
+ bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
+ if (IS_ERR(bip))
+ return PTR_ERR(bip);
+
+ tag_len = io->cc->on_disk_tag_size * bio_sectors(bio);
+
+ bip->bip_iter.bi_size = tag_len;
+ bip->bip_iter.bi_sector = io->cc->start + io->sector;
+
+ /* We own the metadata, do not let bio_free release it */
+ bip->bip_flags &= ~BIP_BLOCK_INTEGRITY;
+
+ ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata),
+ tag_len, offset_in_page(io->integrity_metadata));
+ if (unlikely(ret != tag_len))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
+{
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk);
+
+ /* From now we require underlying device with our integrity profile */
+ if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) {
+ ti->error = "Integrity profile not supported.";
+ return -EINVAL;
+ }
+
+ if (bi->tag_size != cc->on_disk_tag_size ||
+ bi->tuple_size != cc->on_disk_tag_size) {
+ ti->error = "Integrity profile tag size mismatch.";
+ return -EINVAL;
+ }
+ if (1 << bi->interval_exp != cc->sector_size) {
+ ti->error = "Integrity profile sector size mismatch.";
+ return -EINVAL;
+ }
+
+ if (crypt_integrity_aead(cc)) {
+ cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size;
+ DMINFO("Integrity AEAD, tag size %u, IV size %u.",
+ cc->integrity_tag_size, cc->integrity_iv_size);
+
+ if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) {
+ ti->error = "Integrity AEAD auth tag size is not supported.";
+ return -EINVAL;
+ }
+ } else if (cc->integrity_iv_size)
+ DMINFO("Additional per-sector space %u bytes for IV.",
+ cc->integrity_iv_size);
+
+ if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) {
+ ti->error = "Not enough space for integrity tag in the profile.";
+ return -EINVAL;
+ }
+
+ return 0;
+#else
+ ti->error = "Integrity profile not supported.";
+ return -EINVAL;
+#endif
+}
+
static void crypt_convert_init(struct crypt_config *cc,
struct convert_context *ctx,
struct bio *bio_out, struct bio *bio_in,
@@ -822,58 +1006,217 @@ static void crypt_convert_init(struct crypt_config *cc,
}
static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc,
- struct skcipher_request *req)
+ void *req)
{
return (struct dm_crypt_request *)((char *)req + cc->dmreq_start);
}
-static struct skcipher_request *req_of_dmreq(struct crypt_config *cc,
- struct dm_crypt_request *dmreq)
+static void *req_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq)
{
- return (struct skcipher_request *)((char *)dmreq - cc->dmreq_start);
+ return (void *)((char *)dmreq - cc->dmreq_start);
}
static u8 *iv_of_dmreq(struct crypt_config *cc,
struct dm_crypt_request *dmreq)
{
- return (u8 *)ALIGN((unsigned long)(dmreq + 1),
- crypto_skcipher_alignmask(any_tfm(cc)) + 1);
+ if (crypt_integrity_aead(cc))
+ return (u8 *)ALIGN((unsigned long)(dmreq + 1),
+ crypto_aead_alignmask(any_tfm_aead(cc)) + 1);
+ else
+ return (u8 *)ALIGN((unsigned long)(dmreq + 1),
+ crypto_skcipher_alignmask(any_tfm(cc)) + 1);
}
-static int crypt_convert_block(struct crypt_config *cc,
- struct convert_context *ctx,
- struct skcipher_request *req)
+static u8 *org_iv_of_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+{
+ return iv_of_dmreq(cc, dmreq) + cc->iv_size;
+}
+
+static uint64_t *org_sector_of_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+{
+ u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size;
+ return (uint64_t*) ptr;
+}
+
+static unsigned int *org_tag_of_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+{
+ u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size +
+ cc->iv_size + sizeof(uint64_t);
+ return (unsigned int*)ptr;
+}
+
+static void *tag_from_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+{
+ struct convert_context *ctx = dmreq->ctx;
+ struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
+
+ return &io->integrity_metadata[*org_tag_of_dmreq(cc, dmreq) *
+ cc->on_disk_tag_size];
+}
+
+static void *iv_tag_from_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+{
+ return tag_from_dmreq(cc, dmreq) + cc->integrity_tag_size;
+}
+
+static int crypt_convert_block_aead(struct crypt_config *cc,
+ struct convert_context *ctx,
+ struct aead_request *req,
+ unsigned int tag_offset)
{
struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in);
struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
struct dm_crypt_request *dmreq;
- u8 *iv;
- int r;
+ u8 *iv, *org_iv, *tag_iv, *tag;
+ uint64_t *sector;
+ int r = 0;
+
+ BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size);
+
+ /* Reject unexpected unaligned bio. */
+ if (unlikely(bv_in.bv_offset & (cc->sector_size - 1)))
+ return -EIO;
dmreq = dmreq_of_req(cc, req);
+ dmreq->iv_sector = ctx->cc_sector;
+ if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
+ dmreq->iv_sector >>= cc->sector_shift;
+ dmreq->ctx = ctx;
+
+ *org_tag_of_dmreq(cc, dmreq) = tag_offset;
+
+ sector = org_sector_of_dmreq(cc, dmreq);
+ *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset);
+
iv = iv_of_dmreq(cc, dmreq);
+ org_iv = org_iv_of_dmreq(cc, dmreq);
+ tag = tag_from_dmreq(cc, dmreq);
+ tag_iv = iv_tag_from_dmreq(cc, dmreq);
+
+ /* AEAD request:
+ * |----- AAD -------|------ DATA -------|-- AUTH TAG --|
+ * | (authenticated) | (auth+encryption) | |
+ * | sector_LE | IV | sector in/out | tag in/out |
+ */
+ sg_init_table(dmreq->sg_in, 4);
+ sg_set_buf(&dmreq->sg_in[0], sector, sizeof(uint64_t));
+ sg_set_buf(&dmreq->sg_in[1], org_iv, cc->iv_size);
+ sg_set_page(&dmreq->sg_in[2], bv_in.bv_page, cc->sector_size, bv_in.bv_offset);
+ sg_set_buf(&dmreq->sg_in[3], tag, cc->integrity_tag_size);
+
+ sg_init_table(dmreq->sg_out, 4);
+ sg_set_buf(&dmreq->sg_out[0], sector, sizeof(uint64_t));
+ sg_set_buf(&dmreq->sg_out[1], org_iv, cc->iv_size);
+ sg_set_page(&dmreq->sg_out[2], bv_out.bv_page, cc->sector_size, bv_out.bv_offset);
+ sg_set_buf(&dmreq->sg_out[3], tag, cc->integrity_tag_size);
+
+ if (cc->iv_gen_ops) {
+ /* For READs use IV stored in integrity metadata */
+ if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) {
+ memcpy(org_iv, tag_iv, cc->iv_size);
+ } else {
+ r = cc->iv_gen_ops->generator(cc, org_iv, dmreq);
+ if (r < 0)
+ return r;
+ /* Store generated IV in integrity metadata */
+ if (cc->integrity_iv_size)
+ memcpy(tag_iv, org_iv, cc->iv_size);
+ }
+ /* Working copy of IV, to be modified in crypto API */
+ memcpy(iv, org_iv, cc->iv_size);
+ }
+
+ aead_request_set_ad(req, sizeof(uint64_t) + cc->iv_size);
+ if (bio_data_dir(ctx->bio_in) == WRITE) {
+ aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out,
+ cc->sector_size, iv);
+ r = crypto_aead_encrypt(req);
+ if (cc->integrity_tag_size + cc->integrity_iv_size != cc->on_disk_tag_size)
+ memset(tag + cc->integrity_tag_size + cc->integrity_iv_size, 0,
+ cc->on_disk_tag_size - (cc->integrity_tag_size + cc->integrity_iv_size));
+ } else {
+ aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out,
+ cc->sector_size + cc->integrity_tag_size, iv);
+ r = crypto_aead_decrypt(req);
+ }
+
+ if (r == -EBADMSG)
+ DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
+ (unsigned long long)le64_to_cpu(*sector));
+
+ if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
+ r = cc->iv_gen_ops->post(cc, org_iv, dmreq);
+
+ bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size);
+ bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size);
+
+ return r;
+}
+
+static int crypt_convert_block_skcipher(struct crypt_config *cc,
+ struct convert_context *ctx,
+ struct skcipher_request *req,
+ unsigned int tag_offset)
+{
+ struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in);
+ struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
+ struct scatterlist *sg_in, *sg_out;
+ struct dm_crypt_request *dmreq;
+ u8 *iv, *org_iv, *tag_iv;
+ uint64_t *sector;
+ int r = 0;
+ /* Reject unexpected unaligned bio. */
+ if (unlikely(bv_in.bv_offset & (cc->sector_size - 1)))
+ return -EIO;
+
+ dmreq = dmreq_of_req(cc, req);
dmreq->iv_sector = ctx->cc_sector;
+ if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
+ dmreq->iv_sector >>= cc->sector_shift;
dmreq->ctx = ctx;
- sg_init_table(&dmreq->sg_in, 1);
- sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT,
- bv_in.bv_offset);
- sg_init_table(&dmreq->sg_out, 1);
- sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT,
- bv_out.bv_offset);
+ *org_tag_of_dmreq(cc, dmreq) = tag_offset;
+
+ iv = iv_of_dmreq(cc, dmreq);
+ org_iv = org_iv_of_dmreq(cc, dmreq);
+ tag_iv = iv_tag_from_dmreq(cc, dmreq);
+
+ sector = org_sector_of_dmreq(cc, dmreq);
+ *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset);
+
+ /* For skcipher we use only the first sg item */
+ sg_in = &dmreq->sg_in[0];
+ sg_out = &dmreq->sg_out[0];
- bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT);
- bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT);
+ sg_init_table(sg_in, 1);
+ sg_set_page(sg_in, bv_in.bv_page, cc->sector_size, bv_in.bv_offset);
+
+ sg_init_table(sg_out, 1);
+ sg_set_page(sg_out, bv_out.bv_page, cc->sector_size, bv_out.bv_offset);
if (cc->iv_gen_ops) {
- r = cc->iv_gen_ops->generator(cc, iv, dmreq);
- if (r < 0)
- return r;
+ /* For READs use IV stored in integrity metadata */
+ if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) {
+ memcpy(org_iv, tag_iv, cc->integrity_iv_size);
+ } else {
+ r = cc->iv_gen_ops->generator(cc, org_iv, dmreq);
+ if (r < 0)
+ return r;
+ /* Store generated IV in integrity metadata */
+ if (cc->integrity_iv_size)
+ memcpy(tag_iv, org_iv, cc->integrity_iv_size);
+ }
+ /* Working copy of IV, to be modified in crypto API */
+ memcpy(iv, org_iv, cc->iv_size);
}
- skcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out,
- 1 << SECTOR_SHIFT, iv);
+ skcipher_request_set_crypt(req, sg_in, sg_out, cc->sector_size, iv);
if (bio_data_dir(ctx->bio_in) == WRITE)
r = crypto_skcipher_encrypt(req);
@@ -881,7 +1224,10 @@ static int crypt_convert_block(struct crypt_config *cc,
r = crypto_skcipher_decrypt(req);
if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
- r = cc->iv_gen_ops->post(cc, iv, dmreq);
+ r = cc->iv_gen_ops->post(cc, org_iv, dmreq);
+
+ bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size);
+ bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size);
return r;
}
@@ -889,27 +1235,53 @@ static int crypt_convert_block(struct crypt_config *cc,
static void kcryptd_async_done(struct crypto_async_request *async_req,
int error);
-static void crypt_alloc_req(struct crypt_config *cc,
- struct convert_context *ctx)
+static void crypt_alloc_req_skcipher(struct crypt_config *cc,
+ struct convert_context *ctx)
{
unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
- if (!ctx->req)
- ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+ if (!ctx->r.req)
+ ctx->r.req = mempool_alloc(cc->req_pool, GFP_NOIO);
+
+ skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]);
+
+ /*
+ * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
+ * requests if driver request queue is full.
+ */
+ skcipher_request_set_callback(ctx->r.req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ kcryptd_async_done, dmreq_of_req(cc, ctx->r.req));
+}
+
+static void crypt_alloc_req_aead(struct crypt_config *cc,
+ struct convert_context *ctx)
+{
+ if (!ctx->r.req_aead)
+ ctx->r.req_aead = mempool_alloc(cc->req_pool, GFP_NOIO);
- skcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+ aead_request_set_tfm(ctx->r.req_aead, cc->cipher_tfm.tfms_aead[0]);
/*
* Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
* requests if driver request queue is full.
*/
- skcipher_request_set_callback(ctx->req,
+ aead_request_set_callback(ctx->r.req_aead,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- kcryptd_async_done, dmreq_of_req(cc, ctx->req));
+ kcryptd_async_done, dmreq_of_req(cc, ctx->r.req_aead));
+}
+
+static void crypt_alloc_req(struct crypt_config *cc,
+ struct convert_context *ctx)
+{
+ if (crypt_integrity_aead(cc))
+ crypt_alloc_req_aead(cc, ctx);
+ else
+ crypt_alloc_req_skcipher(cc, ctx);
}
-static void crypt_free_req(struct crypt_config *cc,
- struct skcipher_request *req, struct bio *base_bio)
+static void crypt_free_req_skcipher(struct crypt_config *cc,
+ struct skcipher_request *req, struct bio *base_bio)
{
struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
@@ -917,12 +1289,31 @@ static void crypt_free_req(struct crypt_config *cc,
mempool_free(req, cc->req_pool);
}
+static void crypt_free_req_aead(struct crypt_config *cc,
+ struct aead_request *req, struct bio *base_bio)
+{
+ struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
+
+ if ((struct aead_request *)(io + 1) != req)
+ mempool_free(req, cc->req_pool);
+}
+
+static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_bio)
+{
+ if (crypt_integrity_aead(cc))
+ crypt_free_req_aead(cc, req, base_bio);
+ else
+ crypt_free_req_skcipher(cc, req, base_bio);
+}
+
/*
* Encrypt / decrypt data from one bio to another one (can be the same one)
*/
-static int crypt_convert(struct crypt_config *cc,
+static blk_status_t crypt_convert(struct crypt_config *cc,
struct convert_context *ctx)
{
+ unsigned int tag_offset = 0;
+ unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT;
int r;
atomic_set(&ctx->cc_pending, 1);
@@ -930,10 +1321,12 @@ static int crypt_convert(struct crypt_config *cc,
while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) {
crypt_alloc_req(cc, ctx);
-
atomic_inc(&ctx->cc_pending);
- r = crypt_convert_block(cc, ctx, ctx->req);
+ if (crypt_integrity_aead(cc))
+ r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, tag_offset);
+ else
+ r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, tag_offset);
switch (r) {
/*
@@ -949,22 +1342,31 @@ static int crypt_convert(struct crypt_config *cc,
* completion function kcryptd_async_done() will be called.
*/
case -EINPROGRESS:
- ctx->req = NULL;
- ctx->cc_sector++;
+ ctx->r.req = NULL;
+ ctx->cc_sector += sector_step;
+ tag_offset++;
continue;
/*
* The request was already processed (synchronously).
*/
case 0:
atomic_dec(&ctx->cc_pending);
- ctx->cc_sector++;
+ ctx->cc_sector += sector_step;
+ tag_offset++;
cond_resched();
continue;
-
- /* There was an error while processing the request. */
+ /*
+ * There was a data integrity error.
+ */
+ case -EBADMSG:
+ atomic_dec(&ctx->cc_pending);
+ return BLK_STS_PROTECTION;
+ /*
+ * There was an error while processing the request.
+ */
default:
atomic_dec(&ctx->cc_pending);
- return r;
+ return BLK_STS_IOERR;
}
}
@@ -1005,7 +1407,7 @@ retry:
clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
if (!clone)
- goto return_clone;
+ goto out;
clone_init(io, clone);
@@ -1027,7 +1429,13 @@ retry:
remaining_size -= len;
}
-return_clone:
+ /* Allocate space for integrity tags */
+ if (dm_crypt_integrity_io_alloc(io, clone)) {
+ crypt_free_buffer_pages(cc, clone);
+ bio_put(clone);
+ clone = NULL;
+ }
+out:
if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
mutex_unlock(&cc->bio_alloc_lock);
@@ -1053,7 +1461,9 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
io->base_bio = bio;
io->sector = sector;
io->error = 0;
- io->ctx.req = NULL;
+ io->ctx.r.req = NULL;
+ io->integrity_metadata = NULL;
+ io->integrity_metadata_from_pool = false;
atomic_set(&io->io_pending, 0);
}
@@ -1070,15 +1480,20 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->cc;
struct bio *base_bio = io->base_bio;
- int error = io->error;
+ blk_status_t error = io->error;
if (!atomic_dec_and_test(&io->io_pending))
return;
- if (io->ctx.req)
- crypt_free_req(cc, io->ctx.req, base_bio);
+ if (io->ctx.r.req)
+ crypt_free_req(cc, io->ctx.r.req, base_bio);
- base_bio->bi_error = error;
+ if (unlikely(io->integrity_metadata_from_pool))
+ mempool_free(io->integrity_metadata, io->cc->tag_pool);
+ else
+ kfree(io->integrity_metadata);
+
+ base_bio->bi_status = error;
bio_endio(base_bio);
}
@@ -1104,7 +1519,7 @@ static void crypt_endio(struct bio *clone)
struct dm_crypt_io *io = clone->bi_private;
struct crypt_config *cc = io->cc;
unsigned rw = bio_data_dir(clone);
- int error;
+ blk_status_t error;
/*
* free the processed pages
@@ -1112,7 +1527,7 @@ static void crypt_endio(struct bio *clone)
if (rw == WRITE)
crypt_free_buffer_pages(cc, clone);
- error = clone->bi_error;
+ error = clone->bi_status;
bio_put(clone);
if (rw == READ && !error) {
@@ -1156,6 +1571,12 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
clone_init(io, clone);
clone->bi_iter.bi_sector = cc->start + io->sector;
+ if (dm_crypt_integrity_io_alloc(io, clone)) {
+ crypt_dec_pending(io);
+ bio_put(clone);
+ return 1;
+ }
+
generic_make_request(clone);
return 0;
}
@@ -1166,7 +1587,7 @@ static void kcryptd_io_read_work(struct work_struct *work)
crypt_inc_pending(io);
if (kcryptd_io_read(io, GFP_NOIO))
- io->error = -ENOMEM;
+ io->error = BLK_STS_RESOURCE;
crypt_dec_pending(io);
}
@@ -1210,14 +1631,14 @@ continue_locked:
spin_unlock_irq(&cc->write_thread_wait.lock);
if (unlikely(kthread_should_stop())) {
- set_task_state(current, TASK_RUNNING);
+ set_current_state(TASK_RUNNING);
remove_wait_queue(&cc->write_thread_wait, &wait);
break;
}
schedule();
- set_task_state(current, TASK_RUNNING);
+ set_current_state(TASK_RUNNING);
spin_lock_irq(&cc->write_thread_wait.lock);
__remove_wait_queue(&cc->write_thread_wait, &wait);
goto continue_locked;
@@ -1252,7 +1673,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
sector_t sector;
struct rb_node **rbp, *parent;
- if (unlikely(io->error < 0)) {
+ if (unlikely(io->error)) {
crypt_free_buffer_pages(cc, clone);
bio_put(clone);
crypt_dec_pending(io);
@@ -1293,7 +1714,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
struct bio *clone;
int crypt_finished;
sector_t sector = io->sector;
- int r;
+ blk_status_t r;
/*
* Prevent io from disappearing until this function completes.
@@ -1303,7 +1724,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
if (unlikely(!clone)) {
- io->error = -EIO;
+ io->error = BLK_STS_IOERR;
goto dec;
}
@@ -1315,7 +1736,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
crypt_inc_pending(io);
r = crypt_convert(cc, &io->ctx);
if (r)
- io->error = -EIO;
+ io->error = r;
crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
/* Encryption was already finished, submit io now */
@@ -1336,7 +1757,7 @@ static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->cc;
- int r = 0;
+ blk_status_t r;
crypt_inc_pending(io);
@@ -1344,8 +1765,8 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
io->sector);
r = crypt_convert(cc, &io->ctx);
- if (r < 0)
- io->error = -EIO;
+ if (r)
+ io->error = r;
if (atomic_dec_and_test(&io->ctx.cc_pending))
kcryptd_crypt_read_done(io);
@@ -1372,10 +1793,14 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
}
if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
- error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
+ error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq);
- if (error < 0)
- io->error = -EIO;
+ if (error == -EBADMSG) {
+ DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
+ (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)));
+ io->error = BLK_STS_PROTECTION;
+ } else if (error < 0)
+ io->error = BLK_STS_IOERR;
crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
@@ -1406,61 +1831,59 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io)
queue_work(cc->crypt_queue, &io->work);
}
-/*
- * Decode key from its hex representation
- */
-static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
+static void crypt_free_tfms_aead(struct crypt_config *cc)
{
- char buffer[3];
- unsigned int i;
-
- buffer[2] = '\0';
-
- for (i = 0; i < size; i++) {
- buffer[0] = *hex++;
- buffer[1] = *hex++;
+ if (!cc->cipher_tfm.tfms_aead)
+ return;
- if (kstrtou8(buffer, 16, &key[i]))
- return -EINVAL;
+ if (cc->cipher_tfm.tfms_aead[0] && !IS_ERR(cc->cipher_tfm.tfms_aead[0])) {
+ crypto_free_aead(cc->cipher_tfm.tfms_aead[0]);
+ cc->cipher_tfm.tfms_aead[0] = NULL;
}
- if (*hex != '\0')
- return -EINVAL;
-
- return 0;
+ kfree(cc->cipher_tfm.tfms_aead);
+ cc->cipher_tfm.tfms_aead = NULL;
}
-static void crypt_free_tfms(struct crypt_config *cc)
+static void crypt_free_tfms_skcipher(struct crypt_config *cc)
{
unsigned i;
- if (!cc->tfms)
+ if (!cc->cipher_tfm.tfms)
return;
for (i = 0; i < cc->tfms_count; i++)
- if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) {
- crypto_free_skcipher(cc->tfms[i]);
- cc->tfms[i] = NULL;
+ if (cc->cipher_tfm.tfms[i] && !IS_ERR(cc->cipher_tfm.tfms[i])) {
+ crypto_free_skcipher(cc->cipher_tfm.tfms[i]);
+ cc->cipher_tfm.tfms[i] = NULL;
}
- kfree(cc->tfms);
- cc->tfms = NULL;
+ kfree(cc->cipher_tfm.tfms);
+ cc->cipher_tfm.tfms = NULL;
}
-static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
+static void crypt_free_tfms(struct crypt_config *cc)
+{
+ if (crypt_integrity_aead(cc))
+ crypt_free_tfms_aead(cc);
+ else
+ crypt_free_tfms_skcipher(cc);
+}
+
+static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode)
{
unsigned i;
int err;
- cc->tfms = kzalloc(cc->tfms_count * sizeof(struct crypto_skcipher *),
- GFP_KERNEL);
- if (!cc->tfms)
+ cc->cipher_tfm.tfms = kzalloc(cc->tfms_count *
+ sizeof(struct crypto_skcipher *), GFP_KERNEL);
+ if (!cc->cipher_tfm.tfms)
return -ENOMEM;
for (i = 0; i < cc->tfms_count; i++) {
- cc->tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0);
- if (IS_ERR(cc->tfms[i])) {
- err = PTR_ERR(cc->tfms[i]);
+ cc->cipher_tfm.tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0);
+ if (IS_ERR(cc->cipher_tfm.tfms[i])) {
+ err = PTR_ERR(cc->cipher_tfm.tfms[i]);
crypt_free_tfms(cc);
return err;
}
@@ -1469,22 +1892,95 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
return 0;
}
+static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode)
+{
+ int err;
+
+ cc->cipher_tfm.tfms = kmalloc(sizeof(struct crypto_aead *), GFP_KERNEL);
+ if (!cc->cipher_tfm.tfms)
+ return -ENOMEM;
+
+ cc->cipher_tfm.tfms_aead[0] = crypto_alloc_aead(ciphermode, 0, 0);
+ if (IS_ERR(cc->cipher_tfm.tfms_aead[0])) {
+ err = PTR_ERR(cc->cipher_tfm.tfms_aead[0]);
+ crypt_free_tfms(cc);
+ return err;
+ }
+
+ return 0;
+}
+
+static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
+{
+ if (crypt_integrity_aead(cc))
+ return crypt_alloc_tfms_aead(cc, ciphermode);
+ else
+ return crypt_alloc_tfms_skcipher(cc, ciphermode);
+}
+
+static unsigned crypt_subkey_size(struct crypt_config *cc)
+{
+ return (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count);
+}
+
+static unsigned crypt_authenckey_size(struct crypt_config *cc)
+{
+ return crypt_subkey_size(cc) + RTA_SPACE(sizeof(struct crypto_authenc_key_param));
+}
+
+/*
+ * If AEAD is composed like authenc(hmac(sha256),xts(aes)),
+ * the key must, for some reason, be in a special format.
+ * This function converts cc->key to this special format.
+ */
+static void crypt_copy_authenckey(char *p, const void *key,
+ unsigned enckeylen, unsigned authkeylen)
+{
+ struct crypto_authenc_key_param *param;
+ struct rtattr *rta;
+
+ rta = (struct rtattr *)p;
+ param = RTA_DATA(rta);
+ param->enckeylen = cpu_to_be32(enckeylen);
+ rta->rta_len = RTA_LENGTH(sizeof(*param));
+ rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
+ p += RTA_SPACE(sizeof(*param));
+ memcpy(p, key + enckeylen, authkeylen);
+ p += authkeylen;
+ memcpy(p, key, enckeylen);
+}
+
static int crypt_setkey(struct crypt_config *cc)
{
unsigned subkey_size;
int err = 0, i, r;
/* Ignore extra keys (which are used for IV etc) */
- subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count);
+ subkey_size = crypt_subkey_size(cc);
+ if (crypt_integrity_hmac(cc))
+ crypt_copy_authenckey(cc->authenc_key, cc->key,
+ subkey_size - cc->key_mac_size,
+ cc->key_mac_size);
for (i = 0; i < cc->tfms_count; i++) {
- r = crypto_skcipher_setkey(cc->tfms[i],
- cc->key + (i * subkey_size),
- subkey_size);
+ if (crypt_integrity_hmac(cc))
+ r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i],
+ cc->authenc_key, crypt_authenckey_size(cc));
+ else if (crypt_integrity_aead(cc))
+ r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i],
+ cc->key + (i * subkey_size),
+ subkey_size);
+ else
+ r = crypto_skcipher_setkey(cc->cipher_tfm.tfms[i],
+ cc->key + (i * subkey_size),
+ subkey_size);
if (r)
err = r;
}
+ if (crypt_integrity_hmac(cc))
+ memzero_explicit(cc->authenc_key, crypt_authenckey_size(cc));
+
return err;
}
@@ -1536,7 +2032,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
down_read(&key->sem);
- ukp = user_key_payload(key);
+ ukp = user_key_payload_locked(key);
if (!ukp) {
up_read(&key->sem);
key_put(key);
@@ -1633,7 +2129,8 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
kzfree(cc->key_string);
cc->key_string = NULL;
- if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
+ /* Decode key from its hex representation. */
+ if (cc->key_size && hex2bin(cc->key, key, cc->key_size) < 0)
goto out;
r = crypt_setkey(cc);
@@ -1649,12 +2146,16 @@ out:
static int crypt_wipe_key(struct crypt_config *cc)
{
+ int r;
+
clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
- memset(&cc->key, 0, cc->key_size * sizeof(u8));
+ get_random_bytes(&cc->key, cc->key_size);
kzfree(cc->key_string);
cc->key_string = NULL;
+ r = crypt_setkey(cc);
+ memset(&cc->key, 0, cc->key_size * sizeof(u8));
- return crypt_setkey(cc);
+ return r;
}
static void crypt_dtr(struct dm_target *ti)
@@ -1681,6 +2182,7 @@ static void crypt_dtr(struct dm_target *ti)
mempool_destroy(cc->page_pool);
mempool_destroy(cc->req_pool);
+ mempool_destroy(cc->tag_pool);
if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
cc->iv_gen_ops->dtr(cc);
@@ -1691,30 +2193,223 @@ static void crypt_dtr(struct dm_target *ti)
kzfree(cc->cipher);
kzfree(cc->cipher_string);
kzfree(cc->key_string);
+ kzfree(cc->cipher_auth);
+ kzfree(cc->authenc_key);
/* Must zero key material before freeing */
kzfree(cc);
}
-static int crypt_ctr_cipher(struct dm_target *ti,
- char *cipher_in, char *key)
+static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode)
+{
+ struct crypt_config *cc = ti->private;
+
+ if (crypt_integrity_aead(cc))
+ cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc));
+ else
+ cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
+
+ if (cc->iv_size)
+ /* at least a 64 bit sector number should fit in our buffer */
+ cc->iv_size = max(cc->iv_size,
+ (unsigned int)(sizeof(u64) / sizeof(u8)));
+ else if (ivmode) {
+ DMWARN("Selected cipher does not support IVs");
+ ivmode = NULL;
+ }
+
+ /* Choose ivmode, see comments at iv code. */
+ if (ivmode == NULL)
+ cc->iv_gen_ops = NULL;
+ else if (strcmp(ivmode, "plain") == 0)
+ cc->iv_gen_ops = &crypt_iv_plain_ops;
+ else if (strcmp(ivmode, "plain64") == 0)
+ cc->iv_gen_ops = &crypt_iv_plain64_ops;
+ else if (strcmp(ivmode, "plain64be") == 0)
+ cc->iv_gen_ops = &crypt_iv_plain64be_ops;
+ else if (strcmp(ivmode, "essiv") == 0)
+ cc->iv_gen_ops = &crypt_iv_essiv_ops;
+ else if (strcmp(ivmode, "benbi") == 0)
+ cc->iv_gen_ops = &crypt_iv_benbi_ops;
+ else if (strcmp(ivmode, "null") == 0)
+ cc->iv_gen_ops = &crypt_iv_null_ops;
+ else if (strcmp(ivmode, "lmk") == 0) {
+ cc->iv_gen_ops = &crypt_iv_lmk_ops;
+ /*
+ * Versions 2 and 3 are recognised according
+ * to the length of the provided multi-key string.
+ * If present (version 3), last key is used as IV seed.
+ * All keys (including IV seed) are always the same size.
+ */
+ if (cc->key_size % cc->key_parts) {
+ cc->key_parts++;
+ cc->key_extra_size = cc->key_size / cc->key_parts;
+ }
+ } else if (strcmp(ivmode, "tcw") == 0) {
+ cc->iv_gen_ops = &crypt_iv_tcw_ops;
+ cc->key_parts += 2; /* IV + whitening */
+ cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE;
+ } else if (strcmp(ivmode, "random") == 0) {
+ cc->iv_gen_ops = &crypt_iv_random_ops;
+ /* Need storage space in integrity fields. */
+ cc->integrity_iv_size = cc->iv_size;
+ } else {
+ ti->error = "Invalid IV mode";
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Workaround to parse cipher algorithm from crypto API spec.
+ * The cc->cipher is currently used only in ESSIV.
+ * This should probably be done by crypto-api calls (once available...)
+ */
+static int crypt_ctr_blkdev_cipher(struct crypt_config *cc)
+{
+ const char *alg_name = NULL;
+ char *start, *end;
+
+ if (crypt_integrity_aead(cc)) {
+ alg_name = crypto_tfm_alg_name(crypto_aead_tfm(any_tfm_aead(cc)));
+ if (!alg_name)
+ return -EINVAL;
+ if (crypt_integrity_hmac(cc)) {
+ alg_name = strchr(alg_name, ',');
+ if (!alg_name)
+ return -EINVAL;
+ }
+ alg_name++;
+ } else {
+ alg_name = crypto_tfm_alg_name(crypto_skcipher_tfm(any_tfm(cc)));
+ if (!alg_name)
+ return -EINVAL;
+ }
+
+ start = strchr(alg_name, '(');
+ end = strchr(alg_name, ')');
+
+ if (!start && !end) {
+ cc->cipher = kstrdup(alg_name, GFP_KERNEL);
+ return cc->cipher ? 0 : -ENOMEM;
+ }
+
+ if (!start || !end || ++start >= end)
+ return -EINVAL;
+
+ cc->cipher = kzalloc(end - start + 1, GFP_KERNEL);
+ if (!cc->cipher)
+ return -ENOMEM;
+
+ strncpy(cc->cipher, start, end - start);
+
+ return 0;
+}
+
+/*
+ * Workaround to parse HMAC algorithm from AEAD crypto API spec.
+ * The HMAC is needed to calculate tag size (HMAC digest size).
+ * This should probably be done by crypto-api calls (once available...)
+ */
+static int crypt_ctr_auth_cipher(struct crypt_config *cc, char *cipher_api)
+{
+ char *start, *end, *mac_alg = NULL;
+ struct crypto_ahash *mac;
+
+ if (!strstarts(cipher_api, "authenc("))
+ return 0;
+
+ start = strchr(cipher_api, '(');
+ end = strchr(cipher_api, ',');
+ if (!start || !end || ++start > end)
+ return -EINVAL;
+
+ mac_alg = kzalloc(end - start + 1, GFP_KERNEL);
+ if (!mac_alg)
+ return -ENOMEM;
+ strncpy(mac_alg, start, end - start);
+
+ mac = crypto_alloc_ahash(mac_alg, 0, 0);
+ kfree(mac_alg);
+
+ if (IS_ERR(mac))
+ return PTR_ERR(mac);
+
+ cc->key_mac_size = crypto_ahash_digestsize(mac);
+ crypto_free_ahash(mac);
+
+ cc->authenc_key = kmalloc(crypt_authenckey_size(cc), GFP_KERNEL);
+ if (!cc->authenc_key)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key,
+ char **ivmode, char **ivopts)
{
struct crypt_config *cc = ti->private;
- char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
+ char *tmp, *cipher_api;
+ int ret = -EINVAL;
+
+ cc->tfms_count = 1;
+
+ /*
+ * New format (capi: prefix)
+ * capi:cipher_api_spec-iv:ivopts
+ */
+ tmp = &cipher_in[strlen("capi:")];
+ cipher_api = strsep(&tmp, "-");
+ *ivmode = strsep(&tmp, ":");
+ *ivopts = tmp;
+
+ if (*ivmode && !strcmp(*ivmode, "lmk"))
+ cc->tfms_count = 64;
+
+ cc->key_parts = cc->tfms_count;
+
+ /* Allocate cipher */
+ ret = crypt_alloc_tfms(cc, cipher_api);
+ if (ret < 0) {
+ ti->error = "Error allocating crypto tfm";
+ return ret;
+ }
+
+ /* Alloc AEAD, can be used only in new format. */
+ if (crypt_integrity_aead(cc)) {
+ ret = crypt_ctr_auth_cipher(cc, cipher_api);
+ if (ret < 0) {
+ ti->error = "Invalid AEAD cipher spec";
+ return -ENOMEM;
+ }
+ cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc));
+ } else
+ cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
+
+ ret = crypt_ctr_blkdev_cipher(cc);
+ if (ret < 0) {
+ ti->error = "Cannot allocate cipher string";
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key,
+ char **ivmode, char **ivopts)
+{
+ struct crypt_config *cc = ti->private;
+ char *tmp, *cipher, *chainmode, *keycount;
char *cipher_api = NULL;
int ret = -EINVAL;
char dummy;
- /* Convert to crypto api definition? */
- if (strchr(cipher_in, '(')) {
+ if (strchr(cipher_in, '(') || crypt_integrity_aead(cc)) {
ti->error = "Bad cipher specification";
return -EINVAL;
}
- cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
- if (!cc->cipher_string)
- goto bad_mem;
-
/*
* Legacy dm-crypt cipher specification
* cipher[:keycount]-mode-iv:ivopts
@@ -1731,15 +2426,14 @@ static int crypt_ctr_cipher(struct dm_target *ti,
return -EINVAL;
}
cc->key_parts = cc->tfms_count;
- cc->key_extra_size = 0;
cc->cipher = kstrdup(cipher, GFP_KERNEL);
if (!cc->cipher)
goto bad_mem;
chainmode = strsep(&tmp, "-");
- ivopts = strsep(&tmp, "-");
- ivmode = strsep(&ivopts, ":");
+ *ivopts = strsep(&tmp, "-");
+ *ivmode = strsep(&*ivopts, ":");
if (tmp)
DMWARN("Ignoring unexpected additional cipher options");
@@ -1748,12 +2442,12 @@ static int crypt_ctr_cipher(struct dm_target *ti,
* For compatibility with the original dm-crypt mapping format, if
* only the cipher name is supplied, use cbc-plain.
*/
- if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) {
+ if (!chainmode || (!strcmp(chainmode, "plain") && !*ivmode)) {
chainmode = "cbc";
- ivmode = "plain";
+ *ivmode = "plain";
}
- if (strcmp(chainmode, "ecb") && !ivmode) {
+ if (strcmp(chainmode, "ecb") && !*ivmode) {
ti->error = "IV mechanism required";
return -EINVAL;
}
@@ -1773,60 +2467,45 @@ static int crypt_ctr_cipher(struct dm_target *ti,
ret = crypt_alloc_tfms(cc, cipher_api);
if (ret < 0) {
ti->error = "Error allocating crypto tfm";
- goto bad;
+ kfree(cipher_api);
+ return ret;
}
- /* Initialize IV */
- cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
- if (cc->iv_size)
- /* at least a 64 bit sector number should fit in our buffer */
- cc->iv_size = max(cc->iv_size,
- (unsigned int)(sizeof(u64) / sizeof(u8)));
- else if (ivmode) {
- DMWARN("Selected cipher does not support IVs");
- ivmode = NULL;
- }
+ return 0;
+bad_mem:
+ ti->error = "Cannot allocate cipher strings";
+ return -ENOMEM;
+}
- /* Choose ivmode, see comments at iv code. */
- if (ivmode == NULL)
- cc->iv_gen_ops = NULL;
- else if (strcmp(ivmode, "plain") == 0)
- cc->iv_gen_ops = &crypt_iv_plain_ops;
- else if (strcmp(ivmode, "plain64") == 0)
- cc->iv_gen_ops = &crypt_iv_plain64_ops;
- else if (strcmp(ivmode, "essiv") == 0)
- cc->iv_gen_ops = &crypt_iv_essiv_ops;
- else if (strcmp(ivmode, "benbi") == 0)
- cc->iv_gen_ops = &crypt_iv_benbi_ops;
- else if (strcmp(ivmode, "null") == 0)
- cc->iv_gen_ops = &crypt_iv_null_ops;
- else if (strcmp(ivmode, "lmk") == 0) {
- cc->iv_gen_ops = &crypt_iv_lmk_ops;
- /*
- * Version 2 and 3 is recognised according
- * to length of provided multi-key string.
- * If present (version 3), last key is used as IV seed.
- * All keys (including IV seed) are always the same size.
- */
- if (cc->key_size % cc->key_parts) {
- cc->key_parts++;
- cc->key_extra_size = cc->key_size / cc->key_parts;
- }
- } else if (strcmp(ivmode, "tcw") == 0) {
- cc->iv_gen_ops = &crypt_iv_tcw_ops;
- cc->key_parts += 2; /* IV + whitening */
- cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE;
- } else {
- ret = -EINVAL;
- ti->error = "Invalid IV mode";
- goto bad;
+static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key)
+{
+ struct crypt_config *cc = ti->private;
+ char *ivmode = NULL, *ivopts = NULL;
+ int ret;
+
+ cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
+ if (!cc->cipher_string) {
+ ti->error = "Cannot allocate cipher strings";
+ return -ENOMEM;
}
+ if (strstarts(cipher_in, "capi:"))
+ ret = crypt_ctr_cipher_new(ti, cipher_in, key, &ivmode, &ivopts);
+ else
+ ret = crypt_ctr_cipher_old(ti, cipher_in, key, &ivmode, &ivopts);
+ if (ret)
+ return ret;
+
+ /* Initialize IV */
+ ret = crypt_ctr_ivmode(ti, ivmode);
+ if (ret < 0)
+ return ret;
+
/* Initialize and set key */
ret = crypt_set_key(cc, key);
if (ret < 0) {
ti->error = "Error decoding and setting key";
- goto bad;
+ return ret;
}
/* Allocate IV */
@@ -1834,7 +2513,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
ret = cc->iv_gen_ops->ctr(cc, ti, ivopts);
if (ret < 0) {
ti->error = "Error creating IV";
- goto bad;
+ return ret;
}
}
@@ -1843,18 +2522,82 @@ static int crypt_ctr_cipher(struct dm_target *ti,
ret = cc->iv_gen_ops->init(cc);
if (ret < 0) {
ti->error = "Error initialising IV";
- goto bad;
+ return ret;
}
}
- ret = 0;
-bad:
- kfree(cipher_api);
return ret;
+}
-bad_mem:
- ti->error = "Cannot allocate cipher strings";
- return -ENOMEM;
+/*
+ * Parse the optional feature arguments of a dm-crypt table line.
+ *
+ * @ti:   target under construction; ti->private must already point to the
+ *        crypt_config. Every failure detected here stores a message in
+ *        ti->error (dm_read_arg_group() is handed &ti->error as well).
+ * @argc: number of words left after the mandatory arguments
+ * @argv: those words; the first one is the feature count
+ *
+ * Returns 0 on success, -EINVAL for a malformed or unknown feature,
+ * -ENOMEM if the integrity profile name cannot be duplicated, or the
+ * non-zero value returned by dm_read_arg_group().
+ */
+static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct crypt_config *cc = ti->private;
+	struct dm_arg_set as;
+	static struct dm_arg _args[] = {
+		{0, 6, "Invalid number of feature args"},
+	};
+	unsigned int opt_params, val;
+	const char *opt_string, *sval;
+	char dummy;
+	int ret;
+
+	/* Optional parameters */
+	as.argc = argc;
+	as.argv = argv;
+
+	ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+	if (ret)
+		return ret;
+
+	while (opt_params--) {
+		opt_string = dm_shift_arg(&as);
+		if (!opt_string) {
+			ti->error = "Not enough feature arguments";
+			return -EINVAL;
+		}
+
+		if (!strcasecmp(opt_string, "allow_discards"))
+			ti->num_discard_bios = 1;
+
+		else if (!strcasecmp(opt_string, "same_cpu_crypt"))
+			set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+
+		else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
+			set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+		else if (sscanf(opt_string, "integrity:%u:", &val) == 1) {
+			if (val == 0 || val > MAX_TAG_SIZE) {
+				ti->error = "Invalid integrity arguments";
+				return -EINVAL;
+			}
+
+			/*
+			 * sscanf() reports one conversion as soon as the tag
+			 * size has been read, whether or not the second ':'
+			 * follows, so "integrity:32" reaches this point too.
+			 * Reject it instead of computing NULL + 1.
+			 */
+			sval = strchr(opt_string + strlen("integrity:"), ':');
+			if (!sval) {
+				ti->error = "Invalid integrity arguments";
+				return -EINVAL;
+			}
+			sval++;
+
+			cc->on_disk_tag_size = val;
+			if (!strcasecmp(sval, "aead")) {
+				set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags);
+			} else if (strcasecmp(sval, "none")) {
+				ti->error = "Unknown integrity profile";
+				return -EINVAL;
+			}
+
+			cc->cipher_auth = kstrdup(sval, GFP_KERNEL);
+			if (!cc->cipher_auth) {
+				ti->error = "Cannot allocate integrity profile string";
+				return -ENOMEM;
+			}
+		} else if (sscanf(opt_string, "sector_size:%hu%c", &cc->sector_size, &dummy) == 1) {
+			/* Only powers of two from one 512-byte-style sector up to 4096 bytes. */
+			if (cc->sector_size < (1 << SECTOR_SHIFT) ||
+			    cc->sector_size > 4096 ||
+			    (cc->sector_size & (cc->sector_size - 1))) {
+				ti->error = "Invalid feature value for sector_size";
+				return -EINVAL;
+			}
+			cc->sector_shift = __ffs(cc->sector_size) - SECTOR_SHIFT;
+		} else if (!strcasecmp(opt_string, "iv_large_sectors"))
+			set_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags);
+		else {
+			ti->error = "Invalid feature arguments";
+			return -EINVAL;
+		}
+	}
+
+	return 0;
}
/*
@@ -1865,18 +2608,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct crypt_config *cc;
int key_size;
- unsigned int opt_params;
+ unsigned int align_mask;
unsigned long long tmpll;
int ret;
- size_t iv_size_padding;
- struct dm_arg_set as;
- const char *opt_string;
+ size_t iv_size_padding, additional_req_size;
char dummy;
- static struct dm_arg _args[] = {
- {0, 3, "Invalid number of feature args"},
- };
-
if (argc < 5) {
ti->error = "Not enough arguments";
return -EINVAL;
@@ -1894,40 +2631,63 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -ENOMEM;
}
cc->key_size = key_size;
+ cc->sector_size = (1 << SECTOR_SHIFT);
+ cc->sector_shift = 0;
ti->private = cc;
+
+ /* Optional parameters need to be read before cipher constructor */
+ if (argc > 5) {
+ ret = crypt_ctr_optional(ti, argc - 5, &argv[5]);
+ if (ret)
+ goto bad;
+ }
+
ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
if (ret < 0)
goto bad;
- cc->dmreq_start = sizeof(struct skcipher_request);
- cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc));
+ if (crypt_integrity_aead(cc)) {
+ cc->dmreq_start = sizeof(struct aead_request);
+ cc->dmreq_start += crypto_aead_reqsize(any_tfm_aead(cc));
+ align_mask = crypto_aead_alignmask(any_tfm_aead(cc));
+ } else {
+ cc->dmreq_start = sizeof(struct skcipher_request);
+ cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc));
+ align_mask = crypto_skcipher_alignmask(any_tfm(cc));
+ }
cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
- if (crypto_skcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) {
+ if (align_mask < CRYPTO_MINALIGN) {
/* Allocate the padding exactly */
iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request))
- & crypto_skcipher_alignmask(any_tfm(cc));
+ & align_mask;
} else {
/*
* If the cipher requires greater alignment than kmalloc
* alignment, we don't know the exact position of the
* initialization vector. We must assume worst case.
*/
- iv_size_padding = crypto_skcipher_alignmask(any_tfm(cc));
+ iv_size_padding = align_mask;
}
ret = -ENOMEM;
- cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
- sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size);
+
+ /* ...| IV + padding | original IV | original sec. number | bio tag offset | */
+ additional_req_size = sizeof(struct dm_crypt_request) +
+ iv_size_padding + cc->iv_size +
+ cc->iv_size +
+ sizeof(uint64_t) +
+ sizeof(unsigned int);
+
+ cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + additional_req_size);
if (!cc->req_pool) {
ti->error = "Cannot allocate crypt request mempool";
goto bad;
}
cc->per_bio_data_size = ti->per_io_data_size =
- ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start +
- sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size,
+ ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
ARCH_KMALLOC_MINALIGN);
cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0);
@@ -1936,7 +2696,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- cc->bs = bioset_create(MIN_IOS, 0);
+ cc->bs = bioset_create(MIN_IOS, 0, (BIOSET_NEED_BVECS |
+ BIOSET_NEED_RESCUER));
if (!cc->bs) {
ti->error = "Cannot allocate crypt bioset";
goto bad;
@@ -1945,7 +2706,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
mutex_init(&cc->bio_alloc_lock);
ret = -EINVAL;
- if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
+ if ((sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) ||
+ (tmpll & ((cc->sector_size >> SECTOR_SHIFT) - 1))) {
ti->error = "Invalid iv_offset sector";
goto bad;
}
@@ -1964,53 +2726,37 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
cc->start = tmpll;
- argv += 5;
- argc -= 5;
-
- /* Optional parameters */
- if (argc) {
- as.argc = argc;
- as.argv = argv;
-
- ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+ if (crypt_integrity_aead(cc) || cc->integrity_iv_size) {
+ ret = crypt_integrity_ctr(cc, ti);
if (ret)
goto bad;
- ret = -EINVAL;
- while (opt_params--) {
- opt_string = dm_shift_arg(&as);
- if (!opt_string) {
- ti->error = "Not enough feature arguments";
- goto bad;
- }
-
- if (!strcasecmp(opt_string, "allow_discards"))
- ti->num_discard_bios = 1;
+ cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->on_disk_tag_size;
+ if (!cc->tag_pool_max_sectors)
+ cc->tag_pool_max_sectors = 1;
- else if (!strcasecmp(opt_string, "same_cpu_crypt"))
- set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
-
- else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
- set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
-
- else {
- ti->error = "Invalid feature arguments";
- goto bad;
- }
+ cc->tag_pool = mempool_create_kmalloc_pool(MIN_IOS,
+ cc->tag_pool_max_sectors * cc->on_disk_tag_size);
+ if (!cc->tag_pool) {
+ ti->error = "Cannot allocate integrity tags mempool";
+ goto bad;
}
+
+ cc->tag_pool_max_sectors <<= cc->sector_shift;
}
ret = -ENOMEM;
- cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1);
+ cc->io_queue = alloc_workqueue("kcryptd_io", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
if (!cc->io_queue) {
ti->error = "Couldn't create kcryptd io queue";
goto bad;
}
if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
- cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+ cc->crypt_queue = alloc_workqueue("kcryptd", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
else
- cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
+ cc->crypt_queue = alloc_workqueue("kcryptd",
+ WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
num_online_cpus());
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
@@ -2030,7 +2776,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
wake_up_process(cc->write_thread);
ti->num_flush_bios = 1;
- ti->discard_zeroes_data_unsupported = true;
return 0;
@@ -2062,12 +2807,39 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
* Check if bio is too large, split as needed.
*/
if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_PAGES << PAGE_SHIFT)) &&
- bio_data_dir(bio) == WRITE)
+ (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size))
dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT));
+ /*
+ * Ensure that bio is a multiple of internal sector encryption size
+ * and is aligned to this size as defined in IO hints.
+ */
+ if (unlikely((bio->bi_iter.bi_sector & ((cc->sector_size >> SECTOR_SHIFT) - 1)) != 0))
+ return DM_MAPIO_KILL;
+
+ if (unlikely(bio->bi_iter.bi_size & (cc->sector_size - 1)))
+ return DM_MAPIO_KILL;
+
io = dm_per_bio_data(bio, cc->per_bio_data_size);
crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
- io->ctx.req = (struct skcipher_request *)(io + 1);
+
+ if (cc->on_disk_tag_size) {
+ unsigned tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift);
+
+ if (unlikely(tag_len > KMALLOC_MAX_SIZE) ||
+ unlikely(!(io->integrity_metadata = kmalloc(tag_len,
+ GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
+ if (bio_sectors(bio) > cc->tag_pool_max_sectors)
+ dm_accept_partial_bio(bio, cc->tag_pool_max_sectors);
+ io->integrity_metadata = mempool_alloc(cc->tag_pool, GFP_NOIO);
+ io->integrity_metadata_from_pool = true;
+ }
+ }
+
+ if (crypt_integrity_aead(cc))
+ io->ctx.r.req_aead = (struct aead_request *)(io + 1);
+ else
+ io->ctx.r.req = (struct skcipher_request *)(io + 1);
if (bio_data_dir(io->base_bio) == READ) {
if (kcryptd_io_read(io, GFP_NOWAIT))
@@ -2108,6 +2880,10 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
num_feature_args += !!ti->num_discard_bios;
num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+ num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT);
+ num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags);
+ if (cc->on_disk_tag_size)
+ num_feature_args++;
if (num_feature_args) {
DMEMIT(" %d", num_feature_args);
if (ti->num_discard_bios)
@@ -2116,6 +2892,12 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
DMEMIT(" same_cpu_crypt");
if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags))
DMEMIT(" submit_from_crypt_cpus");
+ if (cc->on_disk_tag_size)
+ DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth);
+ if (cc->sector_size != (1 << SECTOR_SHIFT))
+ DMEMIT(" sector_size:%d", cc->sector_size);
+ if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
+ DMEMIT(" iv_large_sectors");
}
break;
@@ -2205,6 +2987,8 @@ static int crypt_iterate_devices(struct dm_target *ti,
static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
+ struct crypt_config *cc = ti->private;
+
/*
* Unfortunate constraint that is required to avoid the potential
* for exceeding underlying device's max_segments limits -- due to
@@ -2212,11 +2996,17 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
* bio that are not as physically contiguous as the original bio.
*/
limits->max_segment_size = PAGE_SIZE;
+
+ if (cc->sector_size != (1 << SECTOR_SHIFT)) {
+ limits->logical_block_size = cc->sector_size;
+ limits->physical_block_size = cc->sector_size;
+ blk_limits_io_min(limits, cc->sector_size);
+ }
}
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 15, 0},
+ .version = {1, 18, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index cc70871a6d29..ae3158795d26 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -340,6 +340,7 @@ out:
static struct target_type delay_target = {
.name = "delay",
.version = {1, 2, 1},
+ .features = DM_TARGET_PASSES_INTEGRITY,
.module = THIS_MODULE,
.ctr = delay_ctr,
.dtr = delay_dtr,
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index bf2b2676cb8a..e7ba89f98d8d 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -254,7 +254,6 @@ static struct dm_block_validator sb_validator = {
* Low level metadata handling
*--------------------------------------------------------------*/
#define DM_ERA_METADATA_BLOCK_SIZE 4096
-#define DM_ERA_METADATA_CACHE_SIZE 64
#define ERA_MAX_CONCURRENT_LOCKS 5
struct era_metadata {
@@ -615,7 +614,6 @@ static int create_persistent_data_objects(struct era_metadata *md,
int r;
md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE,
- DM_ERA_METADATA_CACHE_SIZE,
ERA_MAX_CONCURRENT_LOCKS);
if (IS_ERR(md->bm)) {
DMERR("could not create block manager");
@@ -961,15 +959,15 @@ static int metadata_commit(struct era_metadata *md)
}
}
- r = save_sm_root(md);
+ r = dm_tm_pre_commit(md->tm);
if (r) {
- DMERR("%s: save_sm_root failed", __func__);
+ DMERR("%s: pre commit failed", __func__);
return r;
}
- r = dm_tm_pre_commit(md->tm);
+ r = save_sm_root(md);
if (r) {
- DMERR("%s: pre commit failed", __func__);
+ DMERR("%s: save_sm_root failed", __func__);
return r;
}
@@ -1379,7 +1377,7 @@ static void stop_worker(struct era *era)
static int dev_is_congested(struct dm_dev *dev, int bdi_bits)
{
struct request_queue *q = bdev_get_queue(dev->bdev);
- return bdi_congested(&q->backing_dev_info, bdi_bits);
+ return bdi_congested(q->backing_dev_info, bdi_bits);
}
static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 13305a182611..e2c7234931bc 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -275,7 +275,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
struct flakey_c *fc = ti->private;
bio->bi_bdev = fc->dev->bdev;
- if (bio_sectors(bio))
+ if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)
bio->bi_iter.bi_sector =
flakey_map_sector(ti, bio->bi_iter.bi_sector);
}
@@ -306,6 +306,14 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
pb->bio_submitted = false;
+ /* Do not fail reset zone */
+ if (bio_op(bio) == REQ_OP_ZONE_RESET)
+ goto map_bio;
+
+ /* We need to remap reported zones, so remember the BIO iter */
+ if (bio_op(bio) == REQ_OP_ZONE_REPORT)
+ goto map_bio;
+
/* Are we alive ? */
elapsed = (jiffies - fc->start_time) / HZ;
if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
@@ -321,7 +329,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
if (bio_data_dir(bio) == READ) {
if (!fc->corrupt_bio_byte && !test_bit(DROP_WRITES, &fc->flags) &&
!test_bit(ERROR_WRITES, &fc->flags))
- return -EIO;
+ return DM_MAPIO_KILL;
goto map_bio;
}
@@ -349,7 +357,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
/*
* By default, error all I/O.
*/
- return -EIO;
+ return DM_MAPIO_KILL;
}
map_bio:
@@ -358,12 +366,21 @@ map_bio:
return DM_MAPIO_REMAPPED;
}
-static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error)
+static int flakey_end_io(struct dm_target *ti, struct bio *bio,
+ blk_status_t *error)
{
struct flakey_c *fc = ti->private;
struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
- if (!error && pb->bio_submitted && (bio_data_dir(bio) == READ)) {
+ if (bio_op(bio) == REQ_OP_ZONE_RESET)
+ return DM_ENDIO_DONE;
+
+ if (bio_op(bio) == REQ_OP_ZONE_REPORT) {
+ dm_remap_zone_report(ti, bio, fc->start);
+ return DM_ENDIO_DONE;
+ }
+
+ if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) {
if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == READ) &&
all_corrupt_bio_flags_match(bio, fc)) {
/*
@@ -377,11 +394,11 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error)
* Error read during the down_interval if drop_writes
* and error_writes were not configured.
*/
- return -EIO;
+ *error = BLK_STS_IOERR;
}
}
- return error;
+ return DM_ENDIO_DONE;
}
static void flakey_status(struct dm_target *ti, status_type_t type,
@@ -445,7 +462,8 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
static struct target_type flakey_target = {
.name = "flakey",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
+ .features = DM_TARGET_ZONED_HM,
.module = THIS_MODULE,
.ctr = flakey_ctr,
.dtr = flakey_dtr,
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
new file mode 100644
index 000000000000..1b224aa9cf15
--- /dev/null
+++ b/drivers/md/dm-integrity.c
@@ -0,0 +1,3232 @@
+/*
+ * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2016-2017 Milan Broz
+ * Copyright (C) 2016-2017 Mikulas Patocka
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-io.h>
+#include <linux/vmalloc.h>
+#include <linux/sort.h>
+#include <linux/rbtree.h>
+#include <linux/delay.h>
+#include <linux/random.h>
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
+#include <linux/async_tx.h>
+#include "dm-bufio.h"
+
+#define DM_MSG_PREFIX "integrity"
+
+#define DEFAULT_INTERLEAVE_SECTORS 32768
+#define DEFAULT_JOURNAL_SIZE_FACTOR 7
+#define DEFAULT_BUFFER_SECTORS 128
+#define DEFAULT_JOURNAL_WATERMARK 50
+#define DEFAULT_SYNC_MSEC 10000
+#define DEFAULT_MAX_JOURNAL_SECTORS 131072
+#define MIN_LOG2_INTERLEAVE_SECTORS 3
+#define MAX_LOG2_INTERLEAVE_SECTORS 31
+#define METADATA_WORKQUEUE_MAX_ACTIVE 16
+
+/*
+ * Warning - DEBUG_PRINT prints security-sensitive data to the log,
+ * so it should not be enabled in the official kernel
+ */
+//#define DEBUG_PRINT
+//#define INTERNAL_VERIFY
+
+/*
+ * On disk structures
+ */
+
+#define SB_MAGIC "integrt"
+#define SB_VERSION 1
+#define SB_SECTORS 8
+#define MAX_SECTORS_PER_BLOCK 8
+
+/*
+ * Superblock layout. sync_rw_sb() transfers SB_SECTORS sectors between
+ * the device at ic->start and the in-memory copy in ic->sb.
+ *
+ * NOTE(review): magic/version/flags presumably hold SB_MAGIC, SB_VERSION
+ * and SB_FLAG_* bits; the code that checks them is not in this part of
+ * the file - confirm.
+ */
+struct superblock {
+ __u8 magic[8];
+ __u8 version;
+ __u8 log2_interleave_sectors; /* shift that splits a data sector into (area, offset) */
+ __u16 integrity_tag_size;
+ __u32 journal_sections;
+ __u64 provided_data_sectors; /* userspace uses this value */
+ __u32 flags;
+ __u8 log2_sectors_per_block; /* shift applied by sector_to_block() */
+};
+
+#define SB_FLAG_HAVE_JOURNAL_MAC 0x1
+
+#define JOURNAL_ENTRY_ROUNDUP 8
+
+typedef __u64 commit_id_t;
+#define JOURNAL_MAC_PER_SECTOR 8
+
+/*
+ * Fixed header of one journal entry.
+ *
+ * The target sector can be accessed as one 64-bit value (u.sector) or as
+ * two 32-bit halves (u.s). The journal_entry_* macros store -1 or -2 in
+ * sector_hi to mark an entry as unused or in progress.
+ *
+ * last_bytes[] is indexed up to ic->sectors_per_block by
+ * journal_entry_tag(), which places the tag directly after it.
+ */
+struct journal_entry {
+ union {
+ struct {
+ __u32 sector_lo;
+ __u32 sector_hi;
+ } s;
+ __u64 sector;
+ } u;
+ commit_id_t last_bytes[0];
+ /* __u8 tag[0]; */
+};
+
+/* Address of the tag bytes: they start right after last_bytes[sectors_per_block]. */
+#define journal_entry_tag(ic, je) ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block])
+
+/*
+ * Store/load the sector number of an entry. Every setter issues smp_wmb()
+ * before the final ACCESS_ONCE() store; on 32-bit builds that final store
+ * is the high half, written after the low half.
+ */
+#if BITS_PER_LONG == 64
+#define journal_entry_set_sector(je, x) do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0)
+#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
+#elif defined(CONFIG_LBDAF)
+#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32((x) >> 32); } while (0)
+#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
+#else
+/* Without CONFIG_LBDAF the high half is always written as 0 and only the low half is read back. */
+#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32(0); } while (0)
+#define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo)
+#endif
+/* Entry state markers kept in sector_hi: -1 means unused, -2 means in progress. */
+#define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1))
+#define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0)
+#define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2))
+#define journal_entry_set_inprogress(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-2)); } while (0)
+
+#define JOURNAL_BLOCK_SECTORS 8
+#define JOURNAL_SECTOR_DATA ((1 << SECTOR_SHIFT) - sizeof(commit_id_t))
+#define JOURNAL_MAC_SIZE (JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS)
+
+/*
+ * One sector of the journal. The three members add up to
+ * (1 << SECTOR_SHIFT) bytes: entry space, JOURNAL_MAC_PER_SECTOR bytes of
+ * MAC, and a trailing commit id.
+ *
+ * rw_section_mac() keeps slice j of the section MAC in the mac field of
+ * sector j for the first JOURNAL_BLOCK_SECTORS sectors of a section.
+ */
+struct journal_sector {
+ __u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR];
+ __u8 mac[JOURNAL_MAC_PER_SECTOR];
+ commit_id_t commit_id;
+};
+
+#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK]))
+
+#define METADATA_PADDING_SECTORS 8
+
+#define N_COMMIT_IDS 4
+
+/* Step a commit sequence number one place back, wrapping modulo N_COMMIT_IDS. */
+static unsigned char prev_commit_seq(unsigned char seq)
+{
+	unsigned char previous = (seq + N_COMMIT_IDS - 1) % N_COMMIT_IDS;
+
+	return previous;
+}
+
+/* Step a commit sequence number one place forward, wrapping modulo N_COMMIT_IDS. */
+static unsigned char next_commit_seq(unsigned char seq)
+{
+	unsigned char following = (seq + 1) % N_COMMIT_IDS;
+
+	return following;
+}
+
+/*
+ * In-memory structures
+ */
+
+/*
+ * A red-black tree node that carries a sector number.
+ * NOTE(review): presumably one element of dm_integrity_c.journal_tree,
+ * linked into journal_tree_root - the tree code is not in view; confirm.
+ */
+struct journal_node {
+ struct rb_node node;
+ sector_t sector;
+};
+
+/*
+ * Description of one crypto algorithm and its key. dm_integrity_c holds
+ * three of these: internal_hash_alg, journal_crypt_alg, journal_mac_alg.
+ * NOTE(review): key_string presumably is the textual form of key - the
+ * parsing code is not in view; confirm.
+ */
+struct alg_spec {
+ char *alg_string;
+ char *key_string;
+ __u8 *key;
+ unsigned key_size;
+};
+
+/*
+ * Per-target state of a dm-integrity device.
+ *
+ * Only fields whose use is visible in the helpers below carry a comment;
+ * the rest are left undocumented rather than guessed at.
+ */
+struct dm_integrity_c {
+ struct dm_dev *dev; /* sync_rw_sb() does its I/O on dev->bdev */
+ unsigned tag_size;
+ __s8 log2_tag_size; /* >= 0: tag positions are computed with shifts instead of multiplication */
+ sector_t start; /* device sector of the superblock in sync_rw_sb() */
+ mempool_t *journal_io_mempool;
+ struct dm_io_client *io;
+ struct dm_bufio_client *bufio;
+ struct workqueue_struct *metadata_wq;
+ struct superblock *sb; /* in-memory superblock buffer used by sync_rw_sb() */
+ unsigned journal_pages;
+ struct page_list *journal; /* xor_journal(): source when encrypting, target when decrypting */
+ struct page_list *journal_io; /* xor_journal(): target when encrypting, source when decrypting */
+ struct page_list *journal_xor; /* second xor operand in xor_journal() */
+
+ struct crypto_skcipher *journal_crypt;
+ struct scatterlist **journal_scatterlist;
+ struct scatterlist **journal_io_scatterlist;
+ struct skcipher_request **sk_requests;
+
+ struct crypto_shash *journal_mac; /* NULL makes rw_section_mac() a no-op */
+
+ struct journal_node *journal_tree;
+ struct rb_root journal_tree_root;
+
+ sector_t provided_data_sectors;
+
+ unsigned short journal_entry_size; /* stride between entries in access_journal_entry() */
+ unsigned char journal_entries_per_sector;
+ unsigned char journal_section_entries; /* entry-index limit in access_journal_check() */
+ unsigned short journal_section_sectors; /* sector-index limit in access_journal_check() */
+ unsigned journal_sections; /* section-index limit; wraparound_section() wraps at this value */
+ unsigned journal_entries;
+ sector_t device_sectors;
+ unsigned initial_sectors; /* added to every result of get_data_sector() */
+ unsigned metadata_run;
+ __s8 log2_metadata_run; /* >= 0: metadata_run is applied as a shift */
+ __u8 log2_buffer_sectors;
+ __u8 sectors_per_block;
+
+ unsigned char mode;
+ bool suspending;
+
+ int failed; /* 0, or the first error stored by dm_integrity_io_error() */
+
+ struct crypto_shash *internal_hash;
+
+ /* these variables are locked with endio_wait.lock */
+ struct rb_root in_progress;
+ wait_queue_head_t endio_wait;
+ struct workqueue_struct *wait_wq;
+
+ unsigned char commit_seq;
+ commit_id_t commit_ids[N_COMMIT_IDS]; /* base values mixed with position by dm_integrity_commit_id() */
+
+ unsigned committed_section;
+ unsigned n_committed_sections;
+
+ unsigned uncommitted_section;
+ unsigned n_uncommitted_sections;
+
+ unsigned free_section;
+ unsigned char free_section_entry;
+ unsigned free_sectors;
+
+ unsigned free_sectors_threshold;
+
+ struct workqueue_struct *commit_wq;
+ struct work_struct commit_work;
+
+ struct workqueue_struct *writer_wq;
+ struct work_struct writer_work;
+
+ struct bio_list flush_bio_list;
+
+ unsigned long autocommit_jiffies;
+ struct timer_list autocommit_timer;
+ unsigned autocommit_msec;
+
+ wait_queue_head_t copy_to_journal_wait;
+
+ struct completion crypto_backoff; /* completed by complete_journal_encrypt() on -EINPROGRESS */
+
+ bool journal_uptodate;
+ bool just_formatted;
+
+ struct alg_spec internal_hash_alg;
+ struct alg_spec journal_crypt_alg;
+ struct alg_spec journal_mac_alg;
+};
+
+/*
+ * A run of n_sectors sectors starting at logical_sector, with an rb_node
+ * so it can be linked into a red-black tree.
+ * NOTE(review): presumably the tree is dm_integrity_c.in_progress - the
+ * insertion code is not in view; confirm.
+ */
+struct dm_integrity_range {
+ sector_t logical_sector;
+ unsigned n_sectors;
+ struct rb_node node;
+};
+
+/*
+ * State of one I/O passing through the target.
+ * NOTE(review): the orig_bi_* members presumably save fields of the bio
+ * so they can be restored later, and metadata_block/metadata_offset
+ * presumably hold a result of get_metadata_sector_and_offset() - the code
+ * that fills them is not in view; confirm.
+ */
+struct dm_integrity_io {
+ struct work_struct work;
+
+ struct dm_integrity_c *ic;
+ bool write;
+ bool fua;
+
+ struct dm_integrity_range range;
+
+ sector_t metadata_block;
+ unsigned metadata_offset;
+
+ atomic_t in_flight;
+ blk_status_t bi_status;
+
+ struct completion *completion;
+
+ struct block_device *orig_bi_bdev;
+ bio_end_io_t *orig_bi_end_io;
+ struct bio_integrity_payload *orig_bi_integrity;
+ struct bvec_iter orig_bi_iter;
+};
+
+/*
+ * Counts outstanding journal operations. complete_journal_op() decrements
+ * in_flight and signals comp when the count reaches zero.
+ */
+struct journal_completion {
+ struct dm_integrity_c *ic;
+ atomic_t in_flight;
+ struct completion comp;
+};
+
+/*
+ * Pairs a sector range with the journal_completion it reports to.
+ * NOTE(review): presumably allocated from journal_io_cache - the
+ * allocation site is not in view; confirm.
+ */
+struct journal_io {
+ struct dm_integrity_range range;
+ struct journal_completion *comp;
+};
+
+static struct kmem_cache *journal_io_cache;
+
+#define JOURNAL_IO_MEMPOOL 32
+
+/*
+ * Debug helpers. With DEBUG_PRINT defined, DEBUG_print() logs at
+ * KERN_DEBUG and DEBUG_bytes() logs a formatted message followed by a hex
+ * dump of a buffer. Without it both expand to empty statements.
+ */
+#ifdef DEBUG_PRINT
+#define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__)
+/* Print the formatted message, then ':' and each of the len bytes as two hex digits. */
+static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...)
+{
+ va_list args;
+ va_start(args, msg);
+ vprintk(msg, args);
+ va_end(args);
+ /* the ':' separator is only printed when there is something to dump */
+ if (len)
+ pr_cont(":");
+ while (len) {
+ pr_cont(" %02x", *bytes);
+ bytes++;
+ len--;
+ }
+ pr_cont("\n");
+}
+#define DEBUG_bytes(bytes, len, msg, ...) __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__)
+#else
+#define DEBUG_print(x, ...) do { } while (0)
+#define DEBUG_bytes(bytes, len, msg, ...) do { } while (0)
+#endif
+
+/*
+ * DM integrity profile. It has no generate or verify callback: the
+ * protection itself is performed by the layer above (dm-crypt).
+ */
+static struct blk_integrity_profile dm_integrity_profile = {
+ .name = "DM-DIF-EXT-TAG",
+ .generate_fn = NULL,
+ .verify_fn = NULL,
+};
+
+static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map);
+static void integrity_bio_wait(struct work_struct *w);
+static void dm_integrity_dtr(struct dm_target *ti);
+
+/*
+ * Latch an error on the target and log it.
+ *
+ * ic->failed is only changed while it is still 0, so the first error wins
+ * and is the only one that gets logged.
+ */
+static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err)
+{
+	int already_failed = cmpxchg(&ic->failed, 0, err);
+
+	if (already_failed)
+		return;
+
+	DMERR("Error on %s: %d", msg, err);
+}
+
+/* Return the latched error of the target, or 0 if none has been recorded. */
+static int dm_integrity_failed(struct dm_integrity_c *ic)
+{
+	int status = ACCESS_ONCE(ic->failed);
+
+	return status;
+}
+
+/*
+ * Commit id expected at sector j of section i for sequence number seq.
+ *
+ * The base id is xor-ed with the position, so a piece of journal that was
+ * written to the wrong place does not carry a matching id.
+ */
+static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i,
+					  unsigned j, unsigned char seq)
+{
+	__u64 position = ((__u64)i << 32) ^ j;
+
+	return ic->commit_ids[seq] ^ cpu_to_le64(position);
+}
+
+/*
+ * Split a data sector into the index of its interleave area (*area) and
+ * the sector offset inside that area (*offset), using the superblock's
+ * log2_interleave_sectors as the split point.
+ */
+static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector,
+				sector_t *area, sector_t *offset)
+{
+	const __u8 shift = ic->sb->log2_interleave_sectors;
+	const unsigned in_area_mask = (1U << shift) - 1;
+
+	*area = data_sector >> shift;
+	*offset = (unsigned)data_sector & in_area_mask;
+}
+
+/*
+ * Convert n IN PLACE from sectors to blocks. n is evaluated twice and
+ * assigned to, so it must be a plain lvalue without side effects.
+ * BUGs if n is not a multiple of ic->sectors_per_block.
+ */
+#define sector_to_block(ic, n) \
+do { \
+ BUG_ON((n) & (unsigned)((ic)->sectors_per_block - 1)); \
+ (n) >>= (ic)->sb->log2_sectors_per_block; \
+} while (0)
+
+/*
+ * Locate the tag that belongs to (area, offset).
+ *
+ * The return value is counted in units of (1 << log2_buffer_sectors)
+ * sectors, despite the function name. The byte position of the tag
+ * inside that unit is stored in *metadata_offset.
+ */
+static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area,
+ sector_t offset, unsigned *metadata_offset)
+{
+ __u64 ms;
+ unsigned mo;
+
+ /* area * (interleave sectors + metadata_run), in sectors */
+ ms = area << ic->sb->log2_interleave_sectors;
+ if (likely(ic->log2_metadata_run >= 0))
+ ms += area << ic->log2_metadata_run;
+ else
+ ms += area * ic->metadata_run;
+ /* sectors -> buffer units */
+ ms >>= ic->log2_buffer_sectors;
+
+ /* offset is now a block index within the area */
+ sector_to_block(ic, offset);
+
+ /*
+ * offset * tag_size is the byte position of the tag within the area's
+ * tags; split it into whole buffers (added to ms) and a remainder (mo).
+ * The first branch does the multiplication as a shift.
+ */
+ if (likely(ic->log2_tag_size >= 0)) {
+ ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size);
+ mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
+ } else {
+ ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors);
+ mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
+ }
+ *metadata_offset = mo;
+ return ms;
+}
+
+/*
+ * Compute a device sector from (area, offset):
+ * area interleave-sized runs, plus (area + 1) metadata runs, plus
+ * ic->initial_sectors, plus the offset itself.
+ */
+static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset)
+{
+	const sector_t metadata_runs = area + 1;
+	sector_t sector = area << ic->sb->log2_interleave_sectors;
+
+	if (unlikely(ic->log2_metadata_run < 0))
+		sector += metadata_runs * ic->metadata_run;
+	else
+		sector += metadata_runs << ic->log2_metadata_run;
+
+	sector += (sector_t)ic->initial_sectors + offset;
+
+	return sector;
+}
+
+/*
+ * Wrap a section index that has run past the end of the journal.
+ * A single subtraction is done, so the index may be at most one lap over.
+ */
+static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
+{
+	if (likely(*sec_ptr < ic->journal_sections))
+		return;
+
+	*sec_ptr -= ic->journal_sections;
+}
+
+/*
+ * Read or write the superblock.
+ *
+ * @op / @op_flags are passed to dm_io() as the request operation and its
+ * flags. SB_SECTORS sectors are transferred between ic->sb and the device
+ * at sector ic->start; no notify callback is installed. Returns whatever
+ * dm_io() returns.
+ */
+static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
+{
+	struct dm_io_region region;
+	struct dm_io_request request;
+
+	/* where on the device */
+	region.bdev = ic->dev->bdev;
+	region.sector = ic->start;
+	region.count = SB_SECTORS;
+
+	/* what to do, and with which memory */
+	request.client = ic->io;
+	request.bi_op = op;
+	request.bi_op_flags = op_flags;
+	request.mem.type = DM_IO_KMEM;
+	request.mem.ptr.addr = ic->sb;
+	request.notify.fn = NULL;
+
+	return dm_io(&request, 1, &region, NULL);
+}
+
+/*
+ * Bounds check for journal accesses; compiled in only with
+ * CONFIG_DM_DEBUG or INTERNAL_VERIFY.
+ *
+ * @e selects the limit for @offset: the entries per section when true,
+ * the sectors per section when false. An out-of-range access is logged
+ * with the name in @function and then hits BUG().
+ */
+static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset,
+				 bool e, const char *function)
+{
+#if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY)
+	unsigned limit;
+
+	if (e)
+		limit = ic->journal_section_entries;
+	else
+		limit = ic->journal_section_sectors;
+
+	if (likely(section < ic->journal_sections) && likely(offset < limit))
+		return;
+
+	printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n",
+	       function, section, offset, ic->journal_sections, limit);
+	BUG();
+#endif
+}
+
+/*
+ * Translate (section, sector offset within the section) into a position
+ * in a journal page list: *pl_index is the page number and *pl_offset the
+ * byte offset inside that page.
+ */
+static void page_list_location(struct dm_integrity_c *ic, unsigned section, unsigned offset,
+			       unsigned *pl_index, unsigned *pl_offset)
+{
+	unsigned journal_sector;
+	unsigned byte_pos;
+
+	access_journal_check(ic, section, offset, false, "page_list_location");
+
+	journal_sector = section * ic->journal_section_sectors + offset;
+	byte_pos = journal_sector << SECTOR_SHIFT;
+
+	*pl_index = journal_sector >> (PAGE_SHIFT - SECTOR_SHIFT);
+	*pl_offset = byte_pos & (PAGE_SIZE - 1);
+}
+
+/*
+ * Return the address of journal sector (section, offset) inside page
+ * list @pl. If @n_sectors is not NULL it receives the number of sectors
+ * from that position to the end of the page.
+ */
+static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl,
+					       unsigned section, unsigned offset, unsigned *n_sectors)
+{
+	unsigned page_no, in_page;
+	char *base;
+
+	page_list_location(ic, section, offset, &page_no, &in_page);
+
+	if (n_sectors) {
+		unsigned bytes_left = PAGE_SIZE - in_page;
+
+		*n_sectors = bytes_left >> SECTOR_SHIFT;
+	}
+
+	base = lowmem_page_address(pl[page_no].page);
+
+	return (struct journal_sector *)&base[in_page];
+}
+
+/* Address of sector @offset of @section in the ic->journal page list. */
+static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset)
+{
+	struct page_list *pages = ic->journal;
+
+	return access_page_list(ic, pages, section, offset, NULL);
+}
+
+/*
+ * Address of entry @n of @section.
+ *
+ * Entries are spread round-robin over the first JOURNAL_BLOCK_SECTORS
+ * sectors of the section: n % JOURNAL_BLOCK_SECTORS picks the sector and
+ * n / JOURNAL_BLOCK_SECTORS the slot inside it.
+ */
+static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned section, unsigned n)
+{
+	struct journal_sector *sector_base;
+	unsigned which_sector, slot;
+
+	access_journal_check(ic, section, n, true, "access_journal_entry");
+
+	which_sector = n % JOURNAL_BLOCK_SECTORS;
+	slot = n / JOURNAL_BLOCK_SECTORS;
+
+	sector_base = access_journal(ic, section, which_sector);
+
+	return (struct journal_entry *)((char *)sector_base + slot * ic->journal_entry_size);
+}
+
+/*
+ * Address of data block @n of @section. The block index is scaled to
+ * sectors and placed after the first JOURNAL_BLOCK_SECTORS sectors of
+ * the section.
+ */
+static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned section, unsigned n)
+{
+	unsigned data_sector = (n << ic->sb->log2_sectors_per_block) + JOURNAL_BLOCK_SECTORS;
+
+	access_journal_check(ic, section, data_sector, false, "access_journal_data");
+
+	return access_journal(ic, section, data_sector);
+}
+
+/*
+ * Compute the MAC of one journal section into @result.
+ *
+ * The hash ic->journal_mac is run over the sector field of every entry of
+ * the section, in entry order. @result always gets exactly
+ * JOURNAL_MAC_SIZE bytes: a shorter digest is zero-padded, a longer one is
+ * truncated. If any crypto call fails, the error is recorded with
+ * dm_integrity_io_error() and @result is zero-filled.
+ */
+static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result[JOURNAL_MAC_SIZE])
+{
+ SHASH_DESC_ON_STACK(desc, ic->journal_mac);
+ int r;
+ unsigned j, size;
+
+ desc->tfm = ic->journal_mac;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ r = crypto_shash_init(desc);
+ if (unlikely(r)) {
+ dm_integrity_io_error(ic, "crypto_shash_init", r);
+ goto err;
+ }
+
+ /* only the sector numbers are hashed, not the rest of each entry */
+ for (j = 0; j < ic->journal_section_entries; j++) {
+ struct journal_entry *je = access_journal_entry(ic, section, j);
+ r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector);
+ if (unlikely(r)) {
+ dm_integrity_io_error(ic, "crypto_shash_update", r);
+ goto err;
+ }
+ }
+
+ size = crypto_shash_digestsize(ic->journal_mac);
+
+ if (likely(size <= JOURNAL_MAC_SIZE)) {
+ /* digest fits: write it straight into result and zero the tail */
+ r = crypto_shash_final(desc, result);
+ if (unlikely(r)) {
+ dm_integrity_io_error(ic, "crypto_shash_final", r);
+ goto err;
+ }
+ memset(result + size, 0, JOURNAL_MAC_SIZE - size);
+ } else {
+ /* digest too long: finalize into an on-stack buffer and keep the first JOURNAL_MAC_SIZE bytes */
+ __u8 digest[size];
+ r = crypto_shash_final(desc, digest);
+ if (unlikely(r)) {
+ dm_integrity_io_error(ic, "crypto_shash_final", r);
+ goto err;
+ }
+ memcpy(result, digest, JOURNAL_MAC_SIZE);
+ }
+
+ return;
+err:
+ memset(result, 0, JOURNAL_MAC_SIZE);
+}
+
+/*
+ * Store (@wr true) or verify (@wr false) the MAC of a journal section.
+ *
+ * Does nothing when no journal MAC is configured. The JOURNAL_MAC_SIZE
+ * byte MAC is cut into JOURNAL_MAC_PER_SECTOR byte slices, one per sector
+ * for the first JOURNAL_BLOCK_SECTORS sectors of the section. A mismatch
+ * on verify is reported as -EILSEQ through dm_integrity_io_error().
+ */
+static void rw_section_mac(struct dm_integrity_c *ic, unsigned section, bool wr)
+{
+	__u8 mac[JOURNAL_MAC_SIZE];
+	unsigned sector;
+
+	if (!ic->journal_mac)
+		return;
+
+	section_mac(ic, section, mac);
+
+	for (sector = 0; sector < JOURNAL_BLOCK_SECTORS; sector++) {
+		struct journal_sector *js = access_journal(ic, section, sector);
+		__u8 *slice = mac + (sector * JOURNAL_MAC_PER_SECTOR);
+
+		if (likely(wr)) {
+			memcpy(&js->mac, slice, JOURNAL_MAC_PER_SECTOR);
+			continue;
+		}
+
+		if (memcmp(&js->mac, slice, JOURNAL_MAC_PER_SECTOR))
+			dm_integrity_io_error(ic, "journal mac", -EILSEQ);
+	}
+}
+
+/*
+ * Account for one finished journal operation. @context is the
+ * journal_completion; the waiter is woken when the last outstanding
+ * operation finishes. BUGs if the counter is already zero.
+ */
+static void complete_journal_op(void *context)
+{
+	struct journal_completion *jc = context;
+
+	BUG_ON(!atomic_read(&jc->in_flight));
+
+	if (unlikely(!atomic_dec_and_test(&jc->in_flight)))
+		return;
+
+	complete(&jc->comp);
+}
+
+/*
+ * Transform n_sections journal sections, starting at 'section', by XORing
+ * each journal page with the matching page of ic->journal_xor.
+ *
+ * With encrypt == true the source is ic->journal and the target is
+ * ic->journal_io; otherwise the direction is reversed.  One in-flight
+ * reference is added to 'comp' per page touched and complete_journal_op()
+ * is the async_xor callback, so 'comp' completes once every page is done.
+ * In the encrypt direction the section MACs are written before the pages of
+ * the section are submitted.
+ */
+static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
+ unsigned n_sections, struct journal_completion *comp)
+{
+ struct async_submit_ctl submit;
+ size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT;
+ unsigned pl_index, pl_offset, section_index;
+ struct page_list *source_pl, *target_pl;
+
+ if (likely(encrypt)) {
+ source_pl = ic->journal;
+ target_pl = ic->journal_io;
+ } else {
+ source_pl = ic->journal_io;
+ target_pl = ic->journal;
+ }
+
+ page_list_location(ic, section, 0, &pl_index, &pl_offset);
+
+ /* One reference per page that the loop below will submit. */
+ atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight);
+
+ init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL);
+
+ /* section_index tracks the page index where the next unprocessed section starts. */
+ section_index = pl_index;
+
+ do {
+ size_t this_step;
+ struct page *src_pages[2];
+ struct page *dst_page;
+
+ /*
+ * Handle every section that begins in the current page: write its
+ * MAC (encrypt only) and look up where the following section starts.
+ */
+ while (unlikely(pl_index == section_index)) {
+ unsigned dummy;
+ if (likely(encrypt))
+ rw_section_mac(ic, section, true);
+ section++;
+ n_sections--;
+ if (!n_sections)
+ break;
+ page_list_location(ic, section, 0, &section_index, &dummy);
+ }
+
+ /* Only the first page may start at a non-zero offset. */
+ this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset);
+ dst_page = target_pl[pl_index].page;
+ src_pages[0] = source_pl[pl_index].page;
+ src_pages[1] = ic->journal_xor[pl_index].page;
+
+ async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit);
+
+ pl_index++;
+ pl_offset = 0;
+ n_bytes -= this_step;
+ } while (n_bytes);
+
+ /* All sections must have been consumed by the time the bytes run out. */
+ BUG_ON(n_sections);
+
+ async_tx_issue_pending_all();
+}
+
+/*
+ * Completion callback for asynchronous journal cipher requests.
+ *
+ * An -EINPROGRESS notification only wakes the waiter on ic->crypto_backoff;
+ * it does not finish the operation.  Any other error is reported, and both
+ * success and failure drop one in-flight reference of the completion.
+ */
+static void complete_journal_encrypt(struct crypto_async_request *req, int err)
+{
+	struct journal_completion *jc = req->data;
+
+	if (likely(!err)) {
+		complete_journal_op(jc);
+		return;
+	}
+	if (likely(err == -EINPROGRESS)) {
+		complete(&jc->ic->crypto_backoff);
+		return;
+	}
+	dm_integrity_io_error(jc->ic, "asynchronous encrypt", err);
+	complete_journal_op(jc);
+}
+
+/*
+ * Submit one cipher request for the journal.
+ *
+ * Returns true when the request is still pending and
+ * complete_journal_encrypt() will run for it later, false when it already
+ * finished synchronously (successfully or with an error that is reported
+ * here).  On -EBUSY the caller is blocked on ic->crypto_backoff before
+ * returning.
+ */
+static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp)
+{
+	int rc;
+
+	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+				      complete_journal_encrypt, comp);
+
+	if (likely(encrypt))
+		rc = crypto_skcipher_encrypt(req);
+	else
+		rc = crypto_skcipher_decrypt(req);
+
+	switch (rc) {
+	case 0:
+		return false;
+	case -EINPROGRESS:
+		return true;
+	case -EBUSY:
+		wait_for_completion(&comp->ic->crypto_backoff);
+		reinit_completion(&comp->ic->crypto_backoff);
+		return true;
+	default:
+		dm_integrity_io_error(comp->ic, "encrypt", rc);
+		return false;
+	}
+}
+
+/*
+ * Encrypt or decrypt n_sections journal sections, starting at 'section',
+ * with the per-section skcipher requests in ic->sk_requests.
+ *
+ * With encrypt == true data flows from ic->journal_scatterlist to
+ * ic->journal_io_scatterlist (and the section MAC is written first);
+ * otherwise the direction is reversed.  'comp' completes once all requests
+ * that went asynchronous have called back.
+ */
+static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
+ unsigned n_sections, struct journal_completion *comp)
+{
+ struct scatterlist **source_sg;
+ struct scatterlist **target_sg;
+
+ /*
+ * Two extra references are held while requests are being submitted;
+ * they are dropped at the end of this function (atomic_dec plus
+ * complete_journal_op).
+ */
+ atomic_add(2, &comp->in_flight);
+
+ if (likely(encrypt)) {
+ source_sg = ic->journal_scatterlist;
+ target_sg = ic->journal_io_scatterlist;
+ } else {
+ source_sg = ic->journal_io_scatterlist;
+ target_sg = ic->journal_scatterlist;
+ }
+
+ do {
+ struct skcipher_request *req;
+ unsigned ivsize;
+ char *iv;
+
+ if (likely(encrypt))
+ rw_section_mac(ic, section, true);
+
+ req = ic->sk_requests[section];
+ ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
+ iv = req->iv;
+
+ /*
+ * Copy the second ivsize bytes of the IV buffer over the first.
+ * NOTE(review): presumably the second half holds a pristine copy
+ * of the IV set up where the requests are allocated - confirm.
+ */
+ memcpy(iv, iv + ivsize, ivsize);
+
+ req->src = source_sg[section];
+ req->dst = target_sg[section];
+
+ /* A pending request will call complete_journal_op() later. */
+ if (unlikely(do_crypt(encrypt, req, comp)))
+ atomic_inc(&comp->in_flight);
+
+ section++;
+ n_sections--;
+ } while (n_sections);
+
+ atomic_dec(&comp->in_flight);
+ complete_journal_op(comp);
+}
+
+/*
+ * Transform journal sections with whichever scheme is configured: the XOR
+ * path when ic->journal_xor is set, the per-section cipher path otherwise.
+ */
+static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
+			    unsigned n_sections, struct journal_completion *comp)
+{
+	if (ic->journal_xor) {
+		xor_journal(ic, encrypt, section, n_sections, comp);
+		return;
+	}
+	crypt_journal(ic, encrypt, section, n_sections, comp);
+}
+
+/*
+ * dm_io notification for journal I/O: report a failure (any non-zero error
+ * mask becomes -EIO) and drop one in-flight reference of the completion.
+ */
+static void complete_journal_io(unsigned long error, void *context)
+{
+	struct journal_completion *jc = context;
+
+	if (unlikely(error))
+		dm_integrity_io_error(jc->ic, "writing journal", -EIO);
+	complete_journal_op(jc);
+}
+
+/*
+ * Read or write n_sections journal sections, starting at 'section', between
+ * the in-memory journal pages and the journal area on the device.
+ *
+ * The I/O buffer is ic->journal_io when it exists, ic->journal otherwise.
+ * With a non-NULL 'comp' the I/O is asynchronous and complete_journal_io()
+ * is called for it exactly once, including on the early-failure paths; with
+ * comp == NULL no notify function is installed.
+ */
+static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
+		       unsigned n_sections, struct journal_completion *comp)
+{
+	struct dm_io_request req;
+	struct dm_io_region where;
+	struct page_list *pages;
+	unsigned first_sector, sector_count;
+	int err;
+
+	if (unlikely(dm_integrity_failed(ic))) {
+		if (comp)
+			complete_journal_io(-1UL, comp);
+		return;
+	}
+
+	first_sector = section * ic->journal_section_sectors;
+	sector_count = n_sections * ic->journal_section_sectors;
+
+	pages = ic->journal_io ? ic->journal_io : ic->journal;
+
+	req.bi_op = op;
+	req.bi_op_flags = op_flags;
+	req.mem.type = DM_IO_PAGE_LIST;
+	req.mem.ptr.pl = &pages[first_sector >> (PAGE_SHIFT - SECTOR_SHIFT)];
+	req.mem.offset = (first_sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
+	if (likely(comp != NULL)) {
+		req.notify.fn = complete_journal_io;
+		req.notify.context = comp;
+	} else {
+		req.notify.fn = NULL;
+	}
+	req.client = ic->io;
+
+	where.bdev = ic->dev->bdev;
+	where.sector = ic->start + SB_SECTORS + first_sector;
+	where.count = sector_count;
+
+	err = dm_io(&req, 1, &where, NULL);
+	if (likely(!err))
+		return;
+
+	dm_integrity_io_error(ic, op == REQ_OP_READ ? "reading journal" : "writing journal", err);
+	if (comp) {
+		WARN_ONCE(1, "asynchronous dm_io failed: %d", err);
+		complete_journal_io(-1UL, comp);
+	}
+}
+
+/*
+ * Write commit_sections journal sections, starting at commit_start, to the
+ * device and wait until the write has finished.
+ *
+ * When the range runs past ic->journal_sections it is split into two writes:
+ * commit_start..end of journal, and the remainder from section 0.  If
+ * ic->journal_io exists the sections are first passed through
+ * encrypt_journal(); otherwise only the section MACs are updated before the
+ * write is issued.
+ */
+static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections)
+{
+ struct journal_completion io_comp;
+ struct journal_completion crypt_comp_1;
+ struct journal_completion crypt_comp_2;
+ unsigned i;
+
+ io_comp.ic = ic;
+ io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp);
+
+ if (commit_start + commit_sections <= ic->journal_sections) {
+ /* Contiguous range: a single write, hence one in-flight reference. */
+ io_comp.in_flight = (atomic_t)ATOMIC_INIT(1);
+ if (ic->journal_io) {
+ crypt_comp_1.ic = ic;
+ crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+ crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
+ encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1);
+ wait_for_completion_io(&crypt_comp_1.comp);
+ } else {
+ for (i = 0; i < commit_sections; i++)
+ rw_section_mac(ic, commit_start + i, true);
+ }
+ rw_journal(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, commit_start,
+ commit_sections, &io_comp);
+ } else {
+ unsigned to_end;
+ /* Wrapped range: two writes, hence two in-flight references. */
+ io_comp.in_flight = (atomic_t)ATOMIC_INIT(2);
+ to_end = ic->journal_sections - commit_start;
+ if (ic->journal_io) {
+ crypt_comp_1.ic = ic;
+ crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+ crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
+ encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1);
+ if (try_wait_for_completion(&crypt_comp_1.comp)) {
+ /*
+ * The first part is already transformed: start writing it
+ * and reuse crypt_comp_1 for the second part.
+ */
+ rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
+ crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+ crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
+ encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1);
+ wait_for_completion_io(&crypt_comp_1.comp);
+ } else {
+ /*
+ * The first part is still being transformed: start the
+ * second part with its own completion, then wait for and
+ * write the first part.
+ */
+ crypt_comp_2.ic = ic;
+ crypt_comp_2.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_2.comp);
+ crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0);
+ encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2);
+ wait_for_completion_io(&crypt_comp_1.comp);
+ rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
+ wait_for_completion_io(&crypt_comp_2.comp);
+ }
+ } else {
+ for (i = 0; i < to_end; i++)
+ rw_section_mac(ic, commit_start + i, true);
+ rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
+ for (i = 0; i < commit_sections - to_end; i++)
+ rw_section_mac(ic, i, true);
+ }
+ /* Second half of the wrapped range, starting at section 0. */
+ rw_journal(ic, REQ_OP_WRITE, REQ_FUA, 0, commit_sections - to_end, &io_comp);
+ }
+
+ wait_for_completion_io(&io_comp.comp);
+}
+
+/*
+ * Asynchronously write n_sectors of data held in the in-memory journal
+ * (section 'section', 'offset' sectors into its data area) to device sector
+ * ic->start + target.
+ *
+ * 'fn' is invoked with 'data' when the write finishes; it is also invoked
+ * directly with an all-ones error mask if the target has already failed or
+ * dm_io() could not be started.  target, n_sectors and offset must all be
+ * multiples of ic->sectors_per_block.
+ */
+static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset,
+			      unsigned n_sectors, sector_t target, io_notify_fn fn, void *data)
+{
+	struct dm_io_request req;
+	struct dm_io_region dest;
+	unsigned src_sector;
+	int err;
+
+	BUG_ON((target | n_sectors | offset) & (unsigned)(ic->sectors_per_block - 1));
+
+	if (unlikely(dm_integrity_failed(ic))) {
+		fn(-1UL, data);
+		return;
+	}
+
+	/* Skip the JOURNAL_BLOCK_SECTORS sectors at the start of the section. */
+	src_sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset;
+
+	req.bi_op = REQ_OP_WRITE;
+	req.bi_op_flags = 0;
+	req.mem.type = DM_IO_PAGE_LIST;
+	req.mem.ptr.pl = &ic->journal[src_sector >> (PAGE_SHIFT - SECTOR_SHIFT)];
+	req.mem.offset = (src_sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
+	req.notify.fn = fn;
+	req.notify.context = data;
+	req.client = ic->io;
+
+	dest.bdev = ic->dev->bdev;
+	dest.sector = ic->start + target;
+	dest.count = n_sectors;
+
+	err = dm_io(&req, 1, &dest, NULL);
+	if (likely(!err))
+		return;
+
+	WARN_ONCE(1, "asynchronous dm_io failed: %d", err);
+	fn(-1UL, data);
+}
+
+/*
+ * Try to insert new_range into the ic->in_progress tree.
+ *
+ * Returns true if it was inserted, false (tree untouched) if it overlaps a
+ * range that is already present.  The range must be aligned to
+ * ic->sectors_per_block.
+ */
+static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
+{
+	struct rb_node **slot = &ic->in_progress.rb_node;
+	struct rb_node *parent = NULL;
+
+	BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1));
+
+	while (*slot) {
+		struct dm_integrity_range *cur = container_of(*slot, struct dm_integrity_range, node);
+
+		parent = *slot;
+		if (new_range->logical_sector + new_range->n_sectors <= cur->logical_sector)
+			slot = &cur->node.rb_left;	/* entirely below cur */
+		else if (new_range->logical_sector >= cur->logical_sector + cur->n_sectors)
+			slot = &cur->node.rb_right;	/* entirely above cur */
+		else
+			return false;			/* overlap */
+	}
+
+	rb_link_node(&new_range->node, parent, slot);
+	rb_insert_color(&new_range->node, &ic->in_progress);
+
+	return true;
+}
+
+/*
+ * Take a range out of the ic->in_progress tree and wake the waiters on
+ * ic->endio_wait.  This variant does no locking itself; remove_range()
+ * calls it with ic->endio_wait.lock held.
+ */
+static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range)
+{
+	struct rb_root *in_progress = &ic->in_progress;
+
+	rb_erase(&range->node, in_progress);
+	wake_up_locked(&ic->endio_wait);
+}
+
+/*
+ * Locked wrapper around remove_range_unlocked(): takes ic->endio_wait.lock
+ * with interrupts saved/disabled for the duration of the removal.
+ */
+static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range)
+{
+	spinlock_t *lock = &ic->endio_wait.lock;
+	unsigned long irq_state;
+
+	spin_lock_irqsave(lock, irq_state);
+	remove_range_unlocked(ic, range);
+	spin_unlock_irqrestore(lock, irq_state);
+}
+
+/*
+ * Put a journal node into its "not in the tree" state: cleared rb-node and
+ * an all-ones sector number.
+ */
+static void init_journal_node(struct journal_node *node)
+{
+	node->sector = (sector_t)-1;
+	RB_CLEAR_NODE(&node->node);
+}
+
+/*
+ * Insert 'node' into ic->journal_tree_root keyed by 'sector'.
+ *
+ * The node must not currently be in the tree.  Nodes with an equal sector
+ * are placed to the right of the existing ones, so duplicates are allowed.
+ */
+static void add_journal_node(struct dm_integrity_c *ic, struct journal_node *node, sector_t sector)
+{
+	struct rb_node **slot = &ic->journal_tree_root.rb_node;
+	struct rb_node *parent = NULL;
+
+	node->sector = sector;
+	BUG_ON(!RB_EMPTY_NODE(&node->node));
+
+	while (*slot) {
+		struct journal_node *cur = container_of(*slot, struct journal_node, node);
+
+		parent = *slot;
+		slot = (sector < cur->sector) ? &cur->node.rb_left : &cur->node.rb_right;
+	}
+
+	rb_link_node(&node->node, parent, slot);
+	rb_insert_color(&node->node, &ic->journal_tree_root);
+}
+
+/*
+ * Remove 'node' from ic->journal_tree_root and reset it to the unused state.
+ * The node must currently be in the tree.
+ */
+static void remove_journal_node(struct dm_integrity_c *ic, struct journal_node *node)
+{
+	struct rb_node *rb = &node->node;
+
+	BUG_ON(RB_EMPTY_NODE(rb));
+	rb_erase(rb, &ic->journal_tree_root);
+	init_journal_node(node);
+}
+
+#define NOT_FOUND (-1U)
+
+/*
+ * Look 'sector' up in the journal tree.
+ *
+ * Returns the index (into ic->journal_tree) of a node with that sector, or
+ * NOT_FOUND.  When several nodes share the sector, the one reached last on
+ * the descent (the rightmost) wins.  *next_sector receives the smallest
+ * sector in the tree that is greater than 'sector', or (sector_t)-1 if
+ * there is none.
+ */
+static unsigned find_journal_node(struct dm_integrity_c *ic, sector_t sector, sector_t *next_sector)
+{
+	struct rb_node *cursor = ic->journal_tree_root.rb_node;
+	unsigned match = NOT_FOUND;
+
+	*next_sector = (sector_t)-1;
+
+	while (cursor) {
+		struct journal_node *jn = container_of(cursor, struct journal_node, node);
+
+		if (sector < jn->sector) {
+			*next_sector = jn->sector;
+			cursor = jn->node.rb_left;
+			continue;
+		}
+		if (sector == jn->sector)
+			match = jn - ic->journal_tree;
+		/* Keep going right: equal keys are inserted to the right. */
+		cursor = jn->node.rb_right;
+	}
+
+	return match;
+}
+
+/*
+ * Check whether journal tree slot 'pos' holds 'sector' and is the last node
+ * in tree order with that sector.
+ *
+ * Returns false if pos is out of range, the slot is not in the tree, its
+ * sector differs, or its in-order successor has the same sector.
+ */
+static bool test_journal_node(struct dm_integrity_c *ic, unsigned pos, sector_t sector)
+{
+	struct journal_node *jn;
+	struct rb_node *succ;
+
+	if (unlikely(pos >= ic->journal_entries))
+		return false;
+
+	jn = &ic->journal_tree[pos];
+	if (unlikely(RB_EMPTY_NODE(&jn->node)) || unlikely(jn->sector != sector))
+		return false;
+
+	succ = rb_next(&jn->node);
+	if (unlikely(!succ))
+		return true;
+
+	return container_of(succ, struct journal_node, node)->sector != sector;
+}
+
+/*
+ * Tell whether the in-order successor of 'node' has the same sector and
+ * lives in a journal section inside the committed window
+ * [committed_section, committed_section + n_committed_sections), taking
+ * wrap-around at ic->journal_sections into account.
+ *
+ * 'node' must be in the tree.
+ */
+static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_node *node)
+{
+	struct journal_node *succ_node;
+	struct rb_node *succ;
+	unsigned succ_section;
+
+	BUG_ON(RB_EMPTY_NODE(&node->node));
+
+	succ = rb_next(&node->node);
+	if (unlikely(!succ))
+		return false;
+
+	succ_node = container_of(succ, struct journal_node, node);
+	if (succ_node->sector != node->sector)
+		return false;
+
+	succ_section = (unsigned)(succ_node - ic->journal_tree) / ic->journal_section_entries;
+
+	return (succ_section >= ic->committed_section &&
+		succ_section < ic->committed_section + ic->n_committed_sections) ||
+	       succ_section + ic->journal_sections < ic->committed_section + ic->n_committed_sections;
+}
+
+/* Operations accepted by dm_integrity_rw_tag(). */
+#define TAG_READ 0
+#define TAG_WRITE 1
+#define TAG_CMP 2
+
+/*
+ * Read, write or compare total_size bytes of tag data at the metadata
+ * position (*metadata_block, *metadata_offset), going through ic->bufio.
+ *
+ * The position is advanced past the bytes processed, moving on to the next
+ * metadata block when a buffer is exhausted.
+ *
+ * Returns 0 on success; the non-zero value of dm_integrity_failed() or the
+ * PTR_ERR of a failed dm_bufio_read(); or, for TAG_CMP when the stored tag
+ * differs from 'tag', the positive number of bytes from the first
+ * mismatching byte to the end of the requested range.
+ */
+static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
+ unsigned *metadata_offset, unsigned total_size, int op)
+{
+ do {
+ unsigned char *data, *dp;
+ struct dm_buffer *b;
+ unsigned to_copy;
+ int r;
+
+ r = dm_integrity_failed(ic);
+ if (unlikely(r))
+ return r;
+
+ data = dm_bufio_read(ic->bufio, *metadata_block, &b);
+ if (unlikely(IS_ERR(data)))
+ return PTR_ERR(data);
+
+ /* Process at most up to the end of the current metadata buffer. */
+ to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size);
+ dp = data + *metadata_offset;
+ if (op == TAG_READ) {
+ memcpy(tag, dp, to_copy);
+ } else if (op == TAG_WRITE) {
+ memcpy(dp, tag, to_copy);
+ dm_bufio_mark_buffer_dirty(b);
+ } else {
+ /* e.g.: op == TAG_CMP */
+ if (unlikely(memcmp(dp, tag, to_copy))) {
+ unsigned i;
+
+ /* Discount the matching prefix so the result locates the mismatch. */
+ for (i = 0; i < to_copy; i++) {
+ if (dp[i] != tag[i])
+ break;
+ total_size--;
+ }
+ dm_bufio_release(b);
+ return total_size;
+ }
+ }
+ dm_bufio_release(b);
+
+ tag += to_copy;
+ *metadata_offset += to_copy;
+ if (unlikely(*metadata_offset == 1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) {
+ (*metadata_block)++;
+ *metadata_offset = 0;
+ }
+ total_size -= to_copy;
+ } while (unlikely(total_size));
+
+ return 0;
+}
+
+/*
+ * Write out all dirty tag buffers of ic->bufio; a failure is reported via
+ * dm_integrity_io_error().
+ */
+static void dm_integrity_flush_buffers(struct dm_integrity_c *ic)
+{
+	int err = dm_bufio_write_dirty_buffers(ic->bufio);
+
+	if (unlikely(err))
+		dm_integrity_io_error(ic, "writing tags", err);
+}
+
+/*
+ * Sleep until woken through ic->endio_wait.
+ *
+ * Must be entered with ic->endio_wait.lock held via spin_lock_irq(): the
+ * lock is dropped while sleeping and re-taken before returning.
+ */
+static void sleep_on_endio_wait(struct dm_integrity_c *ic)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ /* Queue ourselves and set the task state before releasing the lock. */
+ __add_wait_queue(&ic->endio_wait, &wait);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&ic->endio_wait.lock);
+ io_schedule();
+ spin_lock_irq(&ic->endio_wait.lock);
+ __remove_wait_queue(&ic->endio_wait, &wait);
+}
+
+/*
+ * Autocommit timer callback: 'data' carries the dm_integrity_c pointer.
+ * Queues the commit work unless the target has already failed.
+ */
+static void autocommit_fn(unsigned long data)
+{
+	struct dm_integrity_c *ic = (struct dm_integrity_c *)data;
+
+	if (unlikely(dm_integrity_failed(ic)))
+		return;
+	queue_work(ic->commit_wq, &ic->commit_work);
+}
+
+/*
+ * Arm the autocommit timer to fire ic->autocommit_jiffies from now, unless
+ * it is already pending.
+ */
+static void schedule_autocommit(struct dm_integrity_c *ic)
+{
+	if (timer_pending(&ic->autocommit_timer))
+		return;
+	mod_timer(&ic->autocommit_timer, jiffies + ic->autocommit_jiffies);
+}
+
+/*
+ * Hand the bio that owns 'dio' to the commit worker: append it to
+ * ic->flush_bio_list under ic->endio_wait.lock and queue the commit work.
+ */
+static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
+{
+	struct bio *flush_bio;
+	unsigned long irq_state;
+
+	spin_lock_irqsave(&ic->endio_wait.lock, irq_state);
+	flush_bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+	bio_list_add(&ic->flush_bio_list, flush_bio);
+	spin_unlock_irqrestore(&ic->endio_wait.lock, irq_state);
+
+	queue_work(ic->commit_wq, &ic->commit_work);
+}
+
+/*
+ * Complete a bio.  If the target is in the failed state and the bio does
+ * not carry an error yet, the target's error is propagated into it first.
+ */
+static void do_endio(struct dm_integrity_c *ic, struct bio *bio)
+{
+	int failed = dm_integrity_failed(ic);
+
+	if (unlikely(failed)) {
+		if (!bio->bi_status)
+			bio->bi_status = errno_to_blk_status(failed);
+	}
+	bio_endio(bio);
+}
+
+/*
+ * Finish the bio that owns 'dio'.  A successful FUA write on a healthy
+ * target is not completed here but passed to submit_flush_bio() so that the
+ * commit worker handles it; everything else is completed immediately.
+ */
+static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
+{
+	struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+	bool needs_flush = unlikely(dio->fua) && likely(!bio->bi_status) &&
+			   likely(!dm_integrity_failed(ic));
+
+	if (!needs_flush) {
+		do_endio(ic, bio);
+		return;
+	}
+	submit_flush_bio(ic, dio);
+}
+
+/*
+ * Drop one in-flight reference of 'dio'.  When the last reference goes:
+ * release the locked range, schedule an autocommit for writes, propagate
+ * dio->bi_status into the bio, and then either requeue the bio for its
+ * remaining sectors (when only part of it was processed without error) or
+ * complete it.
+ */
+static void dec_in_flight(struct dm_integrity_io *dio)
+{
+	struct dm_integrity_c *ic;
+	struct bio *bio;
+
+	if (!atomic_dec_and_test(&dio->in_flight))
+		return;
+
+	ic = dio->ic;
+	remove_range(ic, &dio->range);
+
+	if (unlikely(dio->write))
+		schedule_autocommit(ic);
+
+	bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+
+	if (unlikely(dio->bi_status) && !bio->bi_status)
+		bio->bi_status = dio->bi_status;
+
+	if (unlikely(bio->bi_status) || likely(bio_sectors(bio) == dio->range.n_sectors)) {
+		do_endio_flush(ic, dio);
+		return;
+	}
+
+	/* Only a prefix of the bio was handled: advance and go round again. */
+	dio->range.logical_sector += dio->range.n_sectors;
+	bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT);
+	INIT_WORK(&dio->work, integrity_bio_wait);
+	queue_work(ic->wait_wq, &dio->work);
+}
+
+/*
+ * bi_end_io handler installed on bios sent to the underlying device.
+ *
+ * Restores the fields saved in the per-bio data before submission
+ * (iterator, bdev, integrity payload, end_io), signals the optional
+ * synchronous-read completion and drops one in-flight reference.
+ */
+static void integrity_end_io(struct bio *bio)
+{
+	struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
+	struct bio_integrity_payload *saved_bip = dio->orig_bi_integrity;
+
+	bio->bi_end_io = dio->orig_bi_end_io;
+	bio->bi_bdev = dio->orig_bi_bdev;
+	bio->bi_iter = dio->orig_bi_iter;
+	if (saved_bip) {
+		bio->bi_integrity = saved_bip;
+		bio->bi_opf |= REQ_INTEGRITY;
+	}
+
+	if (dio->completion)
+		complete(dio->completion);
+
+	dec_in_flight(dio);
+}
+
+/*
+ * Compute the tag of one data block with ic->internal_hash.
+ *
+ * The hash input is the little-endian 64-bit sector number followed by the
+ * block's data (ic->sectors_per_block sectors).  The digest is written to
+ * 'result', which must hold max(digest size, ic->tag_size) bytes; a digest
+ * shorter than the tag is zero-padded up to ic->tag_size.  If the hash
+ * fails, the error is reported and the tag is filled with random bytes.
+ */
+static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector,
+				      const char *data, char *result)
+{
+	__u64 sector_le = cpu_to_le64(sector);
+	SHASH_DESC_ON_STACK(req, ic->internal_hash);
+	const char *step;
+	unsigned digest_len;
+	int err;
+
+	req->tfm = ic->internal_hash;
+	req->flags = 0;
+
+	step = "crypto_shash_init";
+	err = crypto_shash_init(req);
+	if (unlikely(err < 0))
+		goto failed;
+
+	step = "crypto_shash_update";
+	err = crypto_shash_update(req, (const __u8 *)&sector_le, sizeof sector_le);
+	if (unlikely(err < 0))
+		goto failed;
+
+	err = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT);
+	if (unlikely(err < 0))
+		goto failed;
+
+	step = "crypto_shash_final";
+	err = crypto_shash_final(req, result);
+	if (unlikely(err < 0))
+		goto failed;
+
+	digest_len = crypto_shash_digestsize(ic->internal_hash);
+	if (unlikely(digest_len < ic->tag_size))
+		memset(result + digest_len, 0, ic->tag_size - digest_len);
+
+	return;
+
+failed:
+	dm_integrity_io_error(ic, step, err);
+	/* this shouldn't happen anyway, the hash functions have no reason to fail */
+	get_random_bytes(result, ic->tag_size);
+}
+
+/*
+ * Work function that processes the tags for the range described by a
+ * dm_integrity_io (embedded work item 'w').
+ *
+ * With an internal hash: compute the checksum of every block of the bio and
+ * either store it in the metadata (writes) or compare it against the stored
+ * value (reads); a mismatch is logged and turned into -EILSEQ.  Nothing is
+ * done in mode 'R'.
+ * Without an internal hash: copy tags between the bio's original integrity
+ * payload and the metadata, if the bio had a payload.
+ *
+ * Always ends by dropping one in-flight reference; on error dio->bi_status
+ * is set first.
+ */
+static void integrity_metadata(struct work_struct *w)
+{
+ struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
+ struct dm_integrity_c *ic = dio->ic;
+
+ int r;
+
+ if (ic->internal_hash) {
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
+ struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+ char *checksums;
+ /* Room for the part of a digest that exceeds the tag size. */
+ unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
+ /* Fallback buffer: holds a single tag (plus digest overhang). */
+ char checksums_onstack[ic->tag_size + extra_space];
+ unsigned sectors_to_process = dio->range.n_sectors;
+ sector_t sector = dio->range.logical_sector;
+
+ if (unlikely(ic->mode == 'R'))
+ goto skip_io;
+
+ /* Preferred buffer: tags for one page worth of blocks. */
+ checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
+ GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
+ if (!checksums)
+ checksums = checksums_onstack;
+
+ __bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) {
+ unsigned pos;
+ char *mem, *checksums_ptr;
+
+again:
+ mem = (char *)kmap_atomic(bv.bv_page) + bv.bv_offset;
+ pos = 0;
+ checksums_ptr = checksums;
+ /*
+ * Checksum blocks of this segment into the buffer.  With the
+ * on-stack buffer only one block is done per pass.
+ */
+ do {
+ integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr);
+ checksums_ptr += ic->tag_size;
+ sectors_to_process -= ic->sectors_per_block;
+ pos += ic->sectors_per_block << SECTOR_SHIFT;
+ sector += ic->sectors_per_block;
+ } while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack);
+ kunmap_atomic(mem);
+
+ r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
+ checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE);
+ if (unlikely(r)) {
+ /* A positive result is a TAG_CMP mismatch, not an I/O error. */
+ if (r > 0) {
+ DMERR("Checksum failed at sector 0x%llx",
+ (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
+ r = -EILSEQ;
+ }
+ if (likely(checksums != checksums_onstack))
+ kfree(checksums);
+ goto error;
+ }
+
+ if (!sectors_to_process)
+ break;
+
+ /* Segment not finished yet: continue with its remainder. */
+ if (unlikely(pos < bv.bv_len)) {
+ bv.bv_offset += pos;
+ bv.bv_len -= pos;
+ goto again;
+ }
+ }
+
+ if (likely(checksums != checksums_onstack))
+ kfree(checksums);
+ } else {
+ struct bio_integrity_payload *bip = dio->orig_bi_integrity;
+
+ if (bip) {
+ struct bio_vec biv;
+ struct bvec_iter iter;
+ /* Number of tag bytes to transfer: blocks in range * tag size. */
+ unsigned data_to_process = dio->range.n_sectors;
+ sector_to_block(ic, data_to_process);
+ data_to_process *= ic->tag_size;
+
+ bip_for_each_vec(biv, bip, iter) {
+ unsigned char *tag;
+ unsigned this_len;
+
+ BUG_ON(PageHighMem(biv.bv_page));
+ tag = lowmem_page_address(biv.bv_page) + biv.bv_offset;
+ this_len = min(biv.bv_len, data_to_process);
+ r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset,
+ this_len, !dio->write ? TAG_READ : TAG_WRITE);
+ if (unlikely(r))
+ goto error;
+ data_to_process -= this_len;
+ if (!data_to_process)
+ break;
+ }
+ }
+ }
+skip_io:
+ dec_in_flight(dio);
+ return;
+error:
+ dio->bi_status = errno_to_blk_status(r);
+ dec_in_flight(dio);
+}
+
+/*
+ * Map function of the integrity target.
+ *
+ * Initializes the per-bio data, validates the bio (range, block alignment,
+ * integrity payload size, no writes in mode 'R'), translates the logical
+ * sector to its data sector and metadata position, and passes the bio on to
+ * dm_integrity_map_continue().  Flush bios go straight to the commit worker.
+ *
+ * Returns DM_MAPIO_SUBMITTED when the bio was taken over, DM_MAPIO_KILL when
+ * it was rejected.
+ */
+static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
+{
+ struct dm_integrity_c *ic = ti->private;
+ struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
+ struct bio_integrity_payload *bip;
+
+ sector_t area, offset;
+
+ dio->ic = ic;
+ dio->bi_status = 0;
+
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
+ submit_flush_bio(ic, dio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
+ dio->write = bio_op(bio) == REQ_OP_WRITE;
+ dio->fua = dio->write && bio->bi_opf & REQ_FUA;
+ if (unlikely(dio->fua)) {
+ /*
+ * Don't pass down the FUA flag because we have to flush
+ * disk cache anyway.
+ */
+ bio->bi_opf &= ~REQ_FUA;
+ }
+ if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) {
+ DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx",
+ (unsigned long long)dio->range.logical_sector, bio_sectors(bio),
+ (unsigned long long)ic->provided_data_sectors);
+ return DM_MAPIO_KILL;
+ }
+ /* Start and length must both be multiples of the block size. */
+ if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) {
+ DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x",
+ ic->sectors_per_block,
+ (unsigned long long)dio->range.logical_sector, bio_sectors(bio));
+ return DM_MAPIO_KILL;
+ }
+
+ /* With multi-sector blocks every bio vector must be block-aligned too. */
+ if (ic->sectors_per_block > 1) {
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ bio_for_each_segment(bv, bio, iter) {
+ if (unlikely((bv.bv_offset | bv.bv_len) & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) {
+ DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary",
+ bv.bv_offset, bv.bv_len, ic->sectors_per_block);
+ return DM_MAPIO_KILL;
+ }
+ }
+ }
+
+ bip = bio_integrity(bio);
+ if (!ic->internal_hash) {
+ /* External tags: a payload, if present, must be exactly one tag per block. */
+ if (bip) {
+ unsigned wanted_tag_size = bio_sectors(bio) >> ic->sb->log2_sectors_per_block;
+ if (ic->log2_tag_size >= 0)
+ wanted_tag_size <<= ic->log2_tag_size;
+ else
+ wanted_tag_size *= ic->tag_size;
+ if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) {
+ DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size);
+ return DM_MAPIO_KILL;
+ }
+ }
+ } else {
+ if (unlikely(bip != NULL)) {
+ DMERR("Unexpected integrity data when using internal hash");
+ return DM_MAPIO_KILL;
+ }
+ }
+
+ if (unlikely(ic->mode == 'R') && unlikely(dio->write))
+ return DM_MAPIO_KILL;
+
+ get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
+ dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
+ bio->bi_iter.bi_sector = get_data_sector(ic, area, offset);
+
+ dm_integrity_map_continue(dio, true);
+ return DM_MAPIO_SUBMITTED;
+}
+
+/*
+ * Copy the range dio->range between the bio and the in-memory journal,
+ * starting at journal position (journal_section, journal_entry).
+ *
+ * Writes copy data (and tags) from the bio into consecutive journal entries
+ * and mark each entry with its logical sector; reads copy data (and tags)
+ * from the journal into the bio, waiting for entries that are still being
+ * filled in.
+ *
+ * Returns true when part of the bio is still unprocessed (dio->range and
+ * the metadata position have been advanced for the next round), false when
+ * the whole bio has been handled.
+ */
+static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
+ unsigned journal_section, unsigned journal_entry)
+{
+ struct dm_integrity_c *ic = dio->ic;
+ sector_t logical_sector;
+ unsigned n_sectors;
+
+ logical_sector = dio->range.logical_sector;
+ n_sectors = dio->range.n_sectors;
+ do {
+ struct bio_vec bv = bio_iovec(bio);
+ char *mem;
+
+ /* Never go beyond the range reserved for this round. */
+ if (unlikely(bv.bv_len >> SECTOR_SHIFT > n_sectors))
+ bv.bv_len = n_sectors << SECTOR_SHIFT;
+ n_sectors -= bv.bv_len >> SECTOR_SHIFT;
+ bio_advance_iter(bio, &bio->bi_iter, bv.bv_len);
+retry_kmap:
+ mem = kmap_atomic(bv.bv_page);
+ if (likely(dio->write))
+ flush_dcache_page(bv.bv_page);
+
+ /* One iteration per block of this bio vector. */
+ do {
+ struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry);
+
+ if (unlikely(!dio->write)) {
+ struct journal_sector *js;
+ char *mem_ptr;
+ unsigned s;
+
+ /*
+ * The entry is still being written by someone else: drop the
+ * atomic mapping (we are about to sleep), wait, and remap.
+ */
+ if (unlikely(journal_entry_is_inprogress(je))) {
+ flush_dcache_page(bv.bv_page);
+ kunmap_atomic(mem);
+
+ __io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je));
+ goto retry_kmap;
+ }
+ smp_rmb();
+ BUG_ON(journal_entry_get_sector(je) != logical_sector);
+ js = access_journal_data(ic, journal_section, journal_entry);
+ mem_ptr = mem + bv.bv_offset;
+ s = 0;
+ /*
+ * Per sector: copy the first JOURNAL_SECTOR_DATA bytes from the
+ * journal sector and take the trailing bytes from
+ * je->last_bytes[], where the write path saved them.
+ */
+ do {
+ memcpy(mem_ptr, js, JOURNAL_SECTOR_DATA);
+ *(commit_id_t *)(mem_ptr + JOURNAL_SECTOR_DATA) = je->last_bytes[s];
+ js++;
+ mem_ptr += 1 << SECTOR_SHIFT;
+ } while (++s < ic->sectors_per_block);
+#ifdef INTERNAL_VERIFY
+ if (ic->internal_hash) {
+ char checksums_onstack[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)];
+
+ integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
+ if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
+ DMERR("Checksum failed when reading from journal, at sector 0x%llx",
+ (unsigned long long)logical_sector);
+ }
+ }
+#endif
+ }
+
+ /*
+ * External tags: move ic->tag_size bytes between the bio's
+ * integrity payload and the journal entry's tag.  A write
+ * without a payload stores an all-zero tag.
+ */
+ if (!ic->internal_hash) {
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ unsigned tag_todo = ic->tag_size;
+ char *tag_ptr = journal_entry_tag(ic, je);
+
+ if (bip) do {
+ struct bio_vec biv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
+ unsigned tag_now = min(biv.bv_len, tag_todo);
+ char *tag_addr;
+ BUG_ON(PageHighMem(biv.bv_page));
+ tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset;
+ if (likely(dio->write))
+ memcpy(tag_ptr, tag_addr, tag_now);
+ else
+ memcpy(tag_addr, tag_ptr, tag_now);
+ bvec_iter_advance(bip->bip_vec, &bip->bip_iter, tag_now);
+ tag_ptr += tag_now;
+ tag_todo -= tag_now;
+ } while (unlikely(tag_todo)); else {
+ if (likely(dio->write))
+ memset(tag_ptr, 0, tag_todo);
+ }
+ }
+
+ if (likely(dio->write)) {
+ struct journal_sector *js;
+ unsigned s;
+
+ js = access_journal_data(ic, journal_section, journal_entry);
+ memcpy(js, mem + bv.bv_offset, ic->sectors_per_block << SECTOR_SHIFT);
+
+ /*
+ * Save the data bytes that now occupy each journal sector's
+ * commit_id field into the entry.
+ */
+ s = 0;
+ do {
+ je->last_bytes[s] = js[s].commit_id;
+ } while (++s < ic->sectors_per_block);
+
+ if (ic->internal_hash) {
+ unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
+ /* A digest larger than the tag needs a bounce buffer. */
+ if (unlikely(digest_size > ic->tag_size)) {
+ char checksums_onstack[digest_size];
+ integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack);
+ memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size);
+ } else
+ integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je));
+ }
+
+ /* Done last: data and tag are in place before the sector is set. */
+ journal_entry_set_sector(je, logical_sector);
+ }
+ logical_sector += ic->sectors_per_block;
+
+ journal_entry++;
+ if (unlikely(journal_entry == ic->journal_section_entries)) {
+ journal_entry = 0;
+ journal_section++;
+ wraparound_section(ic, &journal_section);
+ }
+
+ bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT;
+ } while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT);
+
+ if (unlikely(!dio->write))
+ flush_dcache_page(bv.bv_page);
+ kunmap_atomic(mem);
+ } while (n_sectors);
+
+ if (likely(dio->write)) {
+ smp_mb();
+ /* Wake readers waiting for in-progress entries. */
+ if (unlikely(waitqueue_active(&ic->copy_to_journal_wait)))
+ wake_up(&ic->copy_to_journal_wait);
+ /* Commit right away when journal space is low, otherwise on the timer. */
+ if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) {
+ queue_work(ic->commit_wq, &ic->commit_work);
+ } else {
+ schedule_autocommit(ic);
+ }
+ } else {
+ remove_range(ic, &dio->range);
+ }
+
+ /* Part of the bio is left: set up the position for the next round. */
+ if (unlikely(bio->bi_iter.bi_size)) {
+ sector_t area, offset;
+
+ dio->range.logical_sector = logical_sector;
+ get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
+ dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Second stage of bio processing, called from dm_integrity_map()
+ * (from_map == true) or from a workqueue via integrity_bio_wait()
+ * (from_map == false).
+ *
+ * In journal mode ('J') writes are copied into freshly reserved journal
+ * entries, and reads that hit the journal are served from it; both go
+ * through __journal_read_write().  Everything else locks the range in
+ * ic->in_progress, redirects the bio to the underlying device and has the
+ * tags processed by integrity_metadata().
+ *
+ * When called from the map function this never sleeps: work that would have
+ * to wait (range conflict, no journal space, synchronous read with internal
+ * hash) is deferred to a workqueue instead.
+ */
+static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map)
+{
+	struct dm_integrity_c *ic = dio->ic;
+	struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+	unsigned journal_section, journal_entry;
+	unsigned journal_read_pos;
+	struct completion read_comp;
+	/* Reads with an internal hash are verified in this thread after the I/O. */
+	bool need_sync_io = ic->internal_hash && !dio->write;
+
+	if (need_sync_io && from_map) {
+		INIT_WORK(&dio->work, integrity_bio_wait);
+		queue_work(ic->metadata_wq, &dio->work);
+		return;
+	}
+
+lock_retry:
+	spin_lock_irq(&ic->endio_wait.lock);
+retry:
+	if (unlikely(dm_integrity_failed(ic))) {
+		spin_unlock_irq(&ic->endio_wait.lock);
+		do_endio(ic, bio);
+		return;
+	}
+	dio->range.n_sectors = bio_sectors(bio);
+	journal_read_pos = NOT_FOUND;
+	if (likely(ic->mode == 'J')) {
+		if (dio->write) {
+			unsigned next_entry, i, pos;
+			unsigned ws, we, range_blocks;
+
+			/*
+			 * ic->free_sectors counts free journal entries and each
+			 * entry holds one block, so convert between sectors and
+			 * blocks with log2_sectors_per_block.  Charging one entry
+			 * per 512-byte sector would leave most of the reserved
+			 * entries unused when sectors_per_block > 1.
+			 */
+			dio->range.n_sectors = min(dio->range.n_sectors,
+						   ic->free_sectors << ic->sb->log2_sectors_per_block);
+			if (unlikely(!dio->range.n_sectors))
+				goto sleep;
+			range_blocks = dio->range.n_sectors >> ic->sb->log2_sectors_per_block;
+			ic->free_sectors -= range_blocks;
+			journal_section = ic->free_section;
+			journal_entry = ic->free_section_entry;
+
+			next_entry = ic->free_section_entry + range_blocks;
+			ic->free_section_entry = next_entry % ic->journal_section_entries;
+			ic->free_section += next_entry / ic->journal_section_entries;
+			ic->n_uncommitted_sections += next_entry / ic->journal_section_entries;
+			wraparound_section(ic, &ic->free_section);
+
+			/*
+			 * Register every block of the range in the journal tree and
+			 * mark its journal entry as in progress.
+			 */
+			pos = journal_section * ic->journal_section_entries + journal_entry;
+			ws = journal_section;
+			we = journal_entry;
+			i = 0;
+			do {
+				struct journal_entry *je;
+
+				add_journal_node(ic, &ic->journal_tree[pos], dio->range.logical_sector + i);
+				pos++;
+				if (unlikely(pos >= ic->journal_entries))
+					pos = 0;
+
+				je = access_journal_entry(ic, ws, we);
+				BUG_ON(!journal_entry_is_unused(je));
+				journal_entry_set_inprogress(je);
+				we++;
+				if (unlikely(we == ic->journal_section_entries)) {
+					we = 0;
+					ws++;
+					wraparound_section(ic, &ws);
+				}
+			} while ((i += ic->sectors_per_block) < dio->range.n_sectors);
+
+			spin_unlock_irq(&ic->endio_wait.lock);
+			goto journal_read_write;
+		} else {
+			sector_t next_sector;
+			journal_read_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
+			if (likely(journal_read_pos == NOT_FOUND)) {
+				/* Read from the device, but stop before the next journaled sector. */
+				if (unlikely(dio->range.n_sectors > next_sector - dio->range.logical_sector))
+					dio->range.n_sectors = next_sector - dio->range.logical_sector;
+			} else {
+				/* Read from the journal for as long as consecutive entries match. */
+				unsigned i;
+				unsigned jp = journal_read_pos + 1;
+				for (i = ic->sectors_per_block; i < dio->range.n_sectors; i += ic->sectors_per_block, jp++) {
+					if (!test_journal_node(ic, jp, dio->range.logical_sector + i))
+						break;
+				}
+				dio->range.n_sectors = i;
+			}
+		}
+	}
+	if (unlikely(!add_new_range(ic, &dio->range))) {
+		/*
+		 * We must not sleep in the request routine because it could
+		 * stall bios on current->bio_list.
+		 * So, we offload the bio to a workqueue if we have to sleep.
+		 */
+sleep:
+		if (from_map) {
+			spin_unlock_irq(&ic->endio_wait.lock);
+			INIT_WORK(&dio->work, integrity_bio_wait);
+			queue_work(ic->wait_wq, &dio->work);
+			return;
+		} else {
+			sleep_on_endio_wait(ic);
+			goto retry;
+		}
+	}
+	spin_unlock_irq(&ic->endio_wait.lock);
+
+	if (unlikely(journal_read_pos != NOT_FOUND)) {
+		journal_section = journal_read_pos / ic->journal_section_entries;
+		journal_entry = journal_read_pos % ic->journal_section_entries;
+		goto journal_read_write;
+	}
+
+	/* One reference for the data I/O, one for the tag processing. */
+	dio->in_flight = (atomic_t)ATOMIC_INIT(2);
+
+	if (need_sync_io) {
+		read_comp = COMPLETION_INITIALIZER_ONSTACK(read_comp);
+		dio->completion = &read_comp;
+	} else
+		dio->completion = NULL;
+
+	/* Save what integrity_end_io() has to restore, then redirect the bio. */
+	dio->orig_bi_iter = bio->bi_iter;
+
+	dio->orig_bi_bdev = bio->bi_bdev;
+	bio->bi_bdev = ic->dev->bdev;
+
+	dio->orig_bi_integrity = bio_integrity(bio);
+	bio->bi_integrity = NULL;
+	bio->bi_opf &= ~REQ_INTEGRITY;
+
+	dio->orig_bi_end_io = bio->bi_end_io;
+	bio->bi_end_io = integrity_end_io;
+
+	bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT;
+	bio->bi_iter.bi_sector += ic->start;
+	generic_make_request(bio);
+
+	if (need_sync_io) {
+		wait_for_completion_io(&read_comp);
+		/*
+		 * Do not verify checksums of a read that failed: the buffer
+		 * contents are undefined and the bio already carries the real
+		 * error.  Just drop the reference integrity_metadata() would
+		 * have dropped.
+		 */
+		if (likely(!bio->bi_status))
+			integrity_metadata(&dio->work);
+		else
+			dec_in_flight(dio);
+	} else {
+		INIT_WORK(&dio->work, integrity_metadata);
+		queue_work(ic->metadata_wq, &dio->work);
+	}
+
+	return;
+
+journal_read_write:
+	/* A true result means part of the bio is left: start another round. */
+	if (unlikely(__journal_read_write(dio, bio, journal_section, journal_entry)))
+		goto lock_retry;
+
+	do_endio_flush(ic, dio);
+}
+
+
+ /*
+  * Work item callback installed on dio->work when a bio has to be continued
+  * from a workqueue. Recovers the dm_integrity_io that embeds @w and calls
+  * dm_integrity_map_continue() on it with the second argument set to false.
+  */
+ static void integrity_bio_wait(struct work_struct *w)
+ {
+ 	dm_integrity_map_continue(container_of(w, struct dm_integrity_io, work), false);
+ }
+
+ /*
+  * Close the journal section that is currently being filled.
+  *
+  * When ic->free_section_entry is non-zero the remaining entries of that
+  * section are subtracted from ic->free_sectors, the free position moves to
+  * entry 0 of the next section (with wraparound) and the section is counted
+  * in ic->n_uncommitted_sections. Does nothing when the section is untouched.
+  */
+ static void pad_uncommitted(struct dm_integrity_c *ic)
+ {
+ 	if (!ic->free_section_entry)
+ 		return;
+
+ 	/* the unused remainder of the section is no longer available */
+ 	ic->free_sectors -= ic->journal_section_entries - ic->free_section_entry;
+ 	ic->free_section_entry = 0;
+
+ 	ic->free_section++;
+ 	wraparound_section(ic, &ic->free_section);
+
+ 	ic->n_uncommitted_sections++;
+ }
+
+ /*
+ * Work handler for ic->commit_work: commit the uncommitted journal sections.
+ *
+ * The flush bios queued on ic->flush_bio_list are taken off the list first.
+ * In mode 'J' the partially filled section is padded, every sector of each
+ * uncommitted section is stamped with the commit id for ic->commit_seq and
+ * the sections are written with write_journal(). In any other mode only
+ * dm_integrity_flush_buffers() is called. The collected flush bios are
+ * completed with do_endio() on every path.
+ */
+ static void integrity_commit(struct work_struct *w)
+ {
+ struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, commit_work);
+ unsigned commit_start, commit_sections;
+ unsigned i, j, n;
+ struct bio *flushes;
+
+ del_timer(&ic->autocommit_timer);
+
+ spin_lock_irq(&ic->endio_wait.lock);
+ flushes = bio_list_get(&ic->flush_bio_list);
+ if (unlikely(ic->mode != 'J')) {
+ spin_unlock_irq(&ic->endio_wait.lock);
+ dm_integrity_flush_buffers(ic);
+ goto release_flush_bios;
+ }
+
+ /* snapshot the range to commit while holding the lock */
+ pad_uncommitted(ic);
+ commit_start = ic->uncommitted_section;
+ commit_sections = ic->n_uncommitted_sections;
+ spin_unlock_irq(&ic->endio_wait.lock);
+
+ if (!commit_sections)
+ goto release_flush_bios;
+
+ i = commit_start;
+ for (n = 0; n < commit_sections; n++) {
+ /* wait until no entry of this section is still marked in-progress */
+ for (j = 0; j < ic->journal_section_entries; j++) {
+ struct journal_entry *je;
+ je = access_journal_entry(ic, i, j);
+ io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je));
+ }
+ for (j = 0; j < ic->journal_section_sectors; j++) {
+ struct journal_sector *js;
+ js = access_journal(ic, i, j);
+ js->commit_id = dm_integrity_commit_id(ic, i, j, ic->commit_seq);
+ }
+ i++;
+ /* running past the last section advances the commit sequence */
+ if (unlikely(i >= ic->journal_sections))
+ ic->commit_seq = next_commit_seq(ic->commit_seq);
+ wraparound_section(ic, &i);
+ }
+ smp_rmb();
+
+ write_journal(ic, commit_start, commit_sections);
+
+ /* move the written sections from "uncommitted" to "committed" */
+ spin_lock_irq(&ic->endio_wait.lock);
+ ic->uncommitted_section += commit_sections;
+ wraparound_section(ic, &ic->uncommitted_section);
+ ic->n_uncommitted_sections -= commit_sections;
+ ic->n_committed_sections += commit_sections;
+ spin_unlock_irq(&ic->endio_wait.lock);
+
+ /* start the writer once free journal space is at or below the threshold */
+ if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold)
+ queue_work(ic->writer_wq, &ic->writer_work);
+
+release_flush_bios:
+ /* walk the singly linked bi_next chain and complete each flush bio */
+ while (flushes) {
+ struct bio *next = flushes->bi_next;
+ flushes->bi_next = NULL;
+ do_endio(ic, flushes);
+ flushes = next;
+ }
+ }
+
+ /*
+  * Completion callback passed to copy_from_journal().
+  *
+  * @error:   non-zero when the copy failed
+  * @context: the struct journal_io describing the copied range
+  *
+  * Releases the range held by the journal_io, returns the journal_io to its
+  * mempool, reports a failed copy as -EIO and finally signals the owning
+  * journal_completion.
+  */
+ static void complete_copy_from_journal(unsigned long error, void *context)
+ {
+ 	struct journal_io *jio = context;
+ 	/* fetch everything we need from jio before it goes back to the pool */
+ 	struct journal_completion *jcomp = jio->comp;
+ 	struct dm_integrity_c *ic = jcomp->ic;
+
+ 	remove_range(ic, &jio->range);
+ 	mempool_free(jio, ic->journal_io_mempool);
+
+ 	if (unlikely(error))
+ 		dm_integrity_io_error(ic, "copying from journal", -EIO);
+
+ 	complete_journal_op(jcomp);
+ }
+
+ /*
+  * Copy the values saved in je->last_bytes[] into the commit_id field of
+  * consecutive journal sectors starting at @js, one per sector of the block.
+  * At least one sector is always processed.
+  */
+ static void restore_last_bytes(struct dm_integrity_c *ic, struct journal_sector *js,
+ 			       struct journal_entry *je)
+ {
+ 	unsigned idx = 0;
+
+ 	do {
+ 		js[idx].commit_id = je->last_bytes[idx];
+ 		idx++;
+ 	} while (idx < ic->sectors_per_block);
+ }
+
+ /*
+ * Copy the contents of journal sections to their final location.
+ *
+ * @write_start:    first journal section to process
+ * @write_sections: number of sections to process (wrapping around)
+ * @from_replay:    true when called from replay_journal(), false when called
+ *                  from the writer work
+ *
+ * For every used entry the function collects the run of following entries
+ * that map to consecutive offsets in the same area, reserves that sector
+ * range, writes the tags with dm_integrity_rw_tag() and starts
+ * copy_from_journal() for the data. It returns only after all started copies
+ * have completed and the buffers have been flushed.
+ */
+ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
+ unsigned write_sections, bool from_replay)
+ {
+ unsigned i, j, n;
+ struct journal_completion comp;
+
+ comp.ic = ic;
+ /* starts at 1; the complete_journal_op() call at the end balances it */
+ comp.in_flight = (atomic_t)ATOMIC_INIT(1);
+ comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp);
+
+ i = write_start;
+ for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) {
+ /* without INTERNAL_VERIFY this call is made only during replay */
+#ifndef INTERNAL_VERIFY
+ if (unlikely(from_replay))
+#endif
+ rw_section_mac(ic, i, false);
+ for (j = 0; j < ic->journal_section_entries; j++) {
+ struct journal_entry *je = access_journal_entry(ic, i, j);
+ sector_t sec, area, offset;
+ unsigned k, l, next_loop;
+ sector_t metadata_block;
+ unsigned metadata_offset;
+ struct journal_io *io;
+
+ if (journal_entry_is_unused(je))
+ continue;
+ BUG_ON(unlikely(journal_entry_is_inprogress(je)) && !from_replay);
+ sec = journal_entry_get_sector(je);
+ if (unlikely(from_replay)) {
+ /* a replayed sector must be block aligned; report and round down */
+ if (unlikely(sec & (unsigned)(ic->sectors_per_block - 1))) {
+ dm_integrity_io_error(ic, "invalid sector in journal", -EIO);
+ sec &= ~(sector_t)(ic->sectors_per_block - 1);
+ }
+ }
+ get_area_and_offset(ic, sec, &area, &offset);
+ restore_last_bytes(ic, access_journal_data(ic, i, j), je);
+ /* extend the run [j, k) while entries are contiguous in the same area */
+ for (k = j + 1; k < ic->journal_section_entries; k++) {
+ struct journal_entry *je2 = access_journal_entry(ic, i, k);
+ sector_t sec2, area2, offset2;
+ if (journal_entry_is_unused(je2))
+ break;
+ BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay);
+ sec2 = journal_entry_get_sector(je2);
+ get_area_and_offset(ic, sec2, &area2, &offset2);
+ if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block))
+ break;
+ restore_last_bytes(ic, access_journal_data(ic, i, k), je2);
+ }
+ /* the outer loop's j++ then resumes at the original k */
+ next_loop = k - 1;
+
+ io = mempool_alloc(ic->journal_io_mempool, GFP_NOIO);
+ io->comp = &comp;
+ io->range.logical_sector = sec;
+ io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
+
+ /* reserve the whole run; sleep until no conflicting range is in progress */
+ spin_lock_irq(&ic->endio_wait.lock);
+ while (unlikely(!add_new_range(ic, &io->range)))
+ sleep_on_endio_wait(ic);
+
+ if (likely(!from_replay)) {
+ struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries];
+
+ /* don't write if there is newer committed sector */
+ while (j < k && find_newer_committed_node(ic, &section_node[j])) {
+ struct journal_entry *je2 = access_journal_entry(ic, i, j);
+
+ journal_entry_set_unused(je2);
+ remove_journal_node(ic, &section_node[j]);
+ j++;
+ sec += ic->sectors_per_block;
+ offset += ic->sectors_per_block;
+ }
+ /* same trimming from the tail of the run */
+ while (j < k && find_newer_committed_node(ic, &section_node[k - 1])) {
+ struct journal_entry *je2 = access_journal_entry(ic, i, k - 1);
+
+ journal_entry_set_unused(je2);
+ remove_journal_node(ic, &section_node[k - 1]);
+ k--;
+ }
+ /* everything was superseded: release the range and skip the I/O */
+ if (j == k) {
+ remove_range_unlocked(ic, &io->range);
+ spin_unlock_irq(&ic->endio_wait.lock);
+ mempool_free(io, ic->journal_io_mempool);
+ goto skip_io;
+ }
+ for (l = j; l < k; l++) {
+ remove_journal_node(ic, &section_node[l]);
+ }
+ }
+ spin_unlock_irq(&ic->endio_wait.lock);
+
+ metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
+ for (l = j; l < k; l++) {
+ int r;
+ struct journal_entry *je2 = access_journal_entry(ic, i, l);
+
+ /* with an internal hash, verify the journaled tag (replay only unless INTERNAL_VERIFY) */
+ if (
+#ifndef INTERNAL_VERIFY
+ unlikely(from_replay) &&
+#endif
+ ic->internal_hash) {
+ char test_tag[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)];
+
+ integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
+ (char *)access_journal_data(ic, i, l), test_tag);
+ if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size)))
+ dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ);
+ }
+
+ journal_entry_set_unused(je2);
+ /* NOTE(review): this is a TAG_WRITE but a failure is reported as "reading tags" - confirm intended */
+ r = dm_integrity_rw_tag(ic, journal_entry_tag(ic, je2), &metadata_block, &metadata_offset,
+ ic->tag_size, TAG_WRITE);
+ if (unlikely(r)) {
+ dm_integrity_io_error(ic, "reading tags", r);
+ }
+ }
+
+ /* one in-flight reference per started copy; complete_copy_from_journal() is the callback */
+ atomic_inc(&comp.in_flight);
+ copy_from_journal(ic, i, j << ic->sb->log2_sectors_per_block,
+ (k - j) << ic->sb->log2_sectors_per_block,
+ get_data_sector(ic, area, offset),
+ complete_copy_from_journal, io);
+skip_io:
+ j = next_loop;
+ }
+ }
+
+ dm_bufio_write_dirty_buffers_async(ic->bufio);
+
+ /* balance the initial in_flight value of 1, then wait for the completion */
+ complete_journal_op(&comp);
+ wait_for_completion_io(&comp.comp);
+
+ dm_integrity_flush_buffers(ic);
+ }
+
+ /*
+ * Work handler for ic->writer_work: write all currently committed journal
+ * sections to their final location with do_journal_write() and then return
+ * the space of those sections to ic->free_sectors.
+ *
+ * Does nothing while ic->suspending is set or when no section is committed.
+ */
+ static void integrity_writer(struct work_struct *w)
+ {
+ struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, writer_work);
+ unsigned write_start, write_sections;
+
+ unsigned prev_free_sectors;
+
+ /* the following test is not needed, but it tests the replay code */
+ if (ACCESS_ONCE(ic->suspending))
+ return;
+
+ /* snapshot the committed range under the lock */
+ spin_lock_irq(&ic->endio_wait.lock);
+ write_start = ic->committed_section;
+ write_sections = ic->n_committed_sections;
+ spin_unlock_irq(&ic->endio_wait.lock);
+
+ if (!write_sections)
+ return;
+
+ do_journal_write(ic, write_start, write_sections, false);
+
+ spin_lock_irq(&ic->endio_wait.lock);
+
+ ic->committed_section += write_sections;
+ wraparound_section(ic, &ic->committed_section);
+ ic->n_committed_sections -= write_sections;
+
+ prev_free_sectors = ic->free_sectors;
+ ic->free_sectors += write_sections * ic->journal_section_entries;
+ /* wake sleepers on endio_wait only when the journal had no free space before */
+ if (unlikely(!prev_free_sectors))
+ wake_up_locked(&ic->endio_wait);
+
+ spin_unlock_irq(&ic->endio_wait.lock);
+ }
+
+ /*
+  * Reset @n_sections journal sections, beginning at @start_section and
+  * wrapping around the end of the journal.
+  *
+  * In every sector of each section the entry area is zeroed and the commit id
+  * derived from @commit_seq is stored; every entry is then marked unused.
+  * The sections are finally written with write_journal(). A zero @n_sections
+  * is a no-op.
+  */
+ static void init_journal(struct dm_integrity_c *ic, unsigned start_section,
+ 			 unsigned n_sections, unsigned char commit_seq)
+ {
+ 	unsigned section, pos, done;
+
+ 	if (n_sections == 0)
+ 		return;
+
+ 	for (done = 0; done < n_sections; done++) {
+ 		section = start_section + done;
+ 		wraparound_section(ic, &section);
+
+ 		/* wipe the payload of each sector and stamp its commit id */
+ 		for (pos = 0; pos < ic->journal_section_sectors; pos++) {
+ 			struct journal_sector *js = access_journal(ic, section, pos);
+
+ 			memset(&js->entries, 0, JOURNAL_SECTOR_DATA);
+ 			js->commit_id = dm_integrity_commit_id(ic, section, pos, commit_seq);
+ 		}
+
+ 		for (pos = 0; pos < ic->journal_section_entries; pos++) {
+ 			struct journal_entry *je = access_journal_entry(ic, section, pos);
+
+ 			journal_entry_set_unused(je);
+ 		}
+ 	}
+
+ 	write_journal(ic, start_section, n_sections);
+ }
+
+ /*
+  * Find which commit sequence number produces commit id @id for sector @j of
+  * section @i.
+  *
+  * Returns the sequence number (0 .. N_COMMIT_IDS - 1) on a match. When no
+  * sequence matches, an I/O error is recorded on @ic and -EIO is returned.
+  */
+ static int find_commit_seq(struct dm_integrity_c *ic, unsigned i, unsigned j, commit_id_t id)
+ {
+ 	unsigned char seq = 0;
+
+ 	while (seq < N_COMMIT_IDS) {
+ 		if (dm_integrity_commit_id(ic, i, j, seq) == id)
+ 			return seq;
+ 		seq++;
+ 	}
+
+ 	dm_integrity_io_error(ic, "journal commit id", -EIO);
+ 	return -EIO;
+ }
+
+ /*
+ * Bring the in-memory journal state in line with what is on disk.
+ *
+ * Skipped entirely in mode 'R' and when ic->journal_uptodate is set.
+ * Otherwise the journal is read (unless the device was just formatted), the
+ * commit ids of all sectors are examined to find where the last commit ended,
+ * the consistent run of sections is replayed with do_journal_write() if the
+ * journal is not empty, and then either the existing journal is continued or
+ * the whole journal is re-initialised. Finally all journal position counters
+ * and the journal tree are reset.
+ */
+ static void replay_journal(struct dm_integrity_c *ic)
+ {
+ unsigned i, j;
+ bool used_commit_ids[N_COMMIT_IDS];
+ unsigned max_commit_id_sections[N_COMMIT_IDS];
+ unsigned write_start, write_sections;
+ unsigned continue_section;
+ bool journal_empty;
+ unsigned char unused, last_used, want_commit_seq;
+
+ if (ic->mode == 'R')
+ return;
+
+ if (ic->journal_uptodate)
+ return;
+
+ /* defaults used when jumping to clear_journal before they are computed */
+ last_used = 0;
+ write_start = 0;
+
+ if (!ic->just_formatted) {
+ DEBUG_print("reading journal\n");
+ rw_journal(ic, REQ_OP_READ, 0, 0, ic->journal_sections, NULL);
+ if (ic->journal_io)
+ DEBUG_bytes(lowmem_page_address(ic->journal_io[0].page), 64, "read journal");
+ /* a separate journal_io page list exists only with journal encryption (see create_journal) */
+ if (ic->journal_io) {
+ struct journal_completion crypt_comp;
+ crypt_comp.ic = ic;
+ crypt_comp.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp.comp);
+ crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0);
+ encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp);
+ wait_for_completion(&crypt_comp.comp);
+ }
+ DEBUG_bytes(lowmem_page_address(ic->journal[0].page), 64, "decrypted journal");
+ }
+
+ if (dm_integrity_failed(ic))
+ goto clear_journal;
+
+ /*
+ * Pass over the whole journal: record which commit sequences occur, the
+ * last section in which each one occurs, and whether any entry is used.
+ */
+ journal_empty = true;
+ memset(used_commit_ids, 0, sizeof used_commit_ids);
+ memset(max_commit_id_sections, 0, sizeof max_commit_id_sections);
+ for (i = 0; i < ic->journal_sections; i++) {
+ for (j = 0; j < ic->journal_section_sectors; j++) {
+ int k;
+ struct journal_sector *js = access_journal(ic, i, j);
+ k = find_commit_seq(ic, i, j, js->commit_id);
+ if (k < 0)
+ goto clear_journal;
+ used_commit_ids[k] = true;
+ max_commit_id_sections[k] = i;
+ }
+ if (journal_empty) {
+ for (j = 0; j < ic->journal_section_entries; j++) {
+ struct journal_entry *je = access_journal_entry(ic, i, j);
+ if (!journal_entry_is_unused(je)) {
+ journal_empty = false;
+ break;
+ }
+ }
+ }
+ }
+
+ /*
+ * Determine the first unused commit sequence. If the highest sequence is
+ * unused, walk downwards to the start of the unused run that ends there;
+ * otherwise take the lowest unused one. All sequences in use is an error.
+ */
+ if (!used_commit_ids[N_COMMIT_IDS - 1]) {
+ unused = N_COMMIT_IDS - 1;
+ while (unused && !used_commit_ids[unused - 1])
+ unused--;
+ } else {
+ for (unused = 0; unused < N_COMMIT_IDS; unused++)
+ if (!used_commit_ids[unused])
+ break;
+ if (unused == N_COMMIT_IDS) {
+ dm_integrity_io_error(ic, "journal commit ids", -EIO);
+ goto clear_journal;
+ }
+ }
+ DEBUG_print("first unused commit seq %d [%d,%d,%d,%d]\n",
+ unused, used_commit_ids[0], used_commit_ids[1],
+ used_commit_ids[2], used_commit_ids[3]);
+
+ last_used = prev_commit_seq(unused);
+ want_commit_seq = prev_commit_seq(last_used);
+
+ if (!used_commit_ids[want_commit_seq] && used_commit_ids[prev_commit_seq(want_commit_seq)])
+ journal_empty = true;
+
+ /* start right after the last section that carries the last used sequence */
+ write_start = max_commit_id_sections[last_used] + 1;
+ if (unlikely(write_start >= ic->journal_sections))
+ want_commit_seq = next_commit_seq(want_commit_seq);
+ wraparound_section(ic, &write_start);
+
+ /* count how many sections from write_start carry the expected commit ids */
+ i = write_start;
+ for (write_sections = 0; write_sections < ic->journal_sections; write_sections++) {
+ for (j = 0; j < ic->journal_section_sectors; j++) {
+ struct journal_sector *js = access_journal(ic, i, j);
+
+ if (js->commit_id != dm_integrity_commit_id(ic, i, j, want_commit_seq)) {
+ /*
+ * This could be caused by crash during writing.
+ * We won't replay the inconsistent part of the
+ * journal.
+ */
+ DEBUG_print("commit id mismatch at position (%u, %u): %d != %d\n",
+ i, j, find_commit_seq(ic, i, j, js->commit_id), want_commit_seq);
+ goto brk;
+ }
+ }
+ i++;
+ if (unlikely(i >= ic->journal_sections))
+ want_commit_seq = next_commit_seq(want_commit_seq);
+ wraparound_section(ic, &i);
+ }
+brk:
+
+ if (!journal_empty) {
+ DEBUG_print("replaying %u sections, starting at %u, commit seq %d\n",
+ write_sections, write_start, want_commit_seq);
+ do_journal_write(ic, write_start, write_sections, true);
+ }
+
+ /* keep the journal only if every section was consistent and mode is 'J' or it is empty */
+ if (write_sections == ic->journal_sections && (ic->mode == 'J' || journal_empty)) {
+ continue_section = write_start;
+ ic->commit_seq = want_commit_seq;
+ DEBUG_print("continuing from section %u, commit seq %d\n", write_start, ic->commit_seq);
+ } else {
+ unsigned s;
+ unsigned char erase_seq;
+clear_journal:
+ DEBUG_print("clearing journal\n");
+
+ /*
+ * Re-initialise every section with erase_seq using up to three
+ * init_journal() calls: section write_start, then the following
+ * journal_sections - 2 sections, then the one remaining section.
+ */
+ erase_seq = prev_commit_seq(prev_commit_seq(last_used));
+ s = write_start;
+ init_journal(ic, s, 1, erase_seq);
+ s++;
+ wraparound_section(ic, &s);
+ if (ic->journal_sections >= 2) {
+ init_journal(ic, s, ic->journal_sections - 2, erase_seq);
+ s += ic->journal_sections - 2;
+ wraparound_section(ic, &s);
+ init_journal(ic, s, 1, erase_seq);
+ }
+
+ continue_section = 0;
+ ic->commit_seq = next_commit_seq(erase_seq);
+ }
+
+ /* the journal is now logically empty: reset all positions and counters */
+ ic->committed_section = continue_section;
+ ic->n_committed_sections = 0;
+
+ ic->uncommitted_section = continue_section;
+ ic->n_uncommitted_sections = 0;
+
+ ic->free_section = continue_section;
+ ic->free_section_entry = 0;
+ ic->free_sectors = ic->journal_entries;
+
+ ic->journal_tree_root = RB_ROOT;
+ for (i = 0; i < ic->journal_entries; i++)
+ init_journal_node(&ic->journal_tree[i]);
+ }
+
+ /*
+ * Target postsuspend hook: stop the autocommit timer, run one final commit
+ * and wait for the commit workqueue to drain. In mode 'J' the writer
+ * workqueue is drained and the buffers are flushed as well.
+ *
+ * ic->suspending is set for the duration; integrity_writer() returns early
+ * while it is set. On return ic->journal_uptodate is true, which makes the
+ * next replay_journal() call return immediately.
+ */
+ static void dm_integrity_postsuspend(struct dm_target *ti)
+ {
+ struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
+
+ del_timer_sync(&ic->autocommit_timer);
+
+ ic->suspending = true;
+
+ queue_work(ic->commit_wq, &ic->commit_work);
+ drain_workqueue(ic->commit_wq);
+
+ if (ic->mode == 'J') {
+ drain_workqueue(ic->writer_wq);
+ dm_integrity_flush_buffers(ic);
+ }
+
+ ic->suspending = false;
+
+ /* no sector range may still be registered as in progress at this point */
+ BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
+
+ ic->journal_uptodate = true;
+ }
+
+ /*
+  * Target resume hook: hand the target's private context to replay_journal().
+  */
+ static void dm_integrity_resume(struct dm_target *ti)
+ {
+ 	replay_journal((struct dm_integrity_c *)ti->private);
+ }
+
+ /*
+ * Target status hook.
+ *
+ * STATUSTYPE_INFO:  produces an empty string.
+ * STATUSTYPE_TABLE: emits the table line - device name, start offset, tag
+ *                   size, mode, the number of optional arguments and then
+ *                   the optional arguments themselves.
+ *
+ * Output goes to @result through DMEMIT(), which relies on the local names
+ * sz, result and maxlen.
+ */
+ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+ {
+ struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
+ unsigned arg_count;
+ size_t sz = 0;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = '\0';
+ break;
+
+ case STATUSTYPE_TABLE: {
+ /* used part of the journal at the threshold, as a percentage rounded to nearest */
+ __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
+ watermark_percentage += ic->journal_entries / 2;
+ do_div(watermark_percentage, ic->journal_entries);
+ /* five arguments are always emitted; the others only when set */
+ arg_count = 5;
+ arg_count += ic->sectors_per_block != 1;
+ arg_count += !!ic->internal_hash_alg.alg_string;
+ arg_count += !!ic->journal_crypt_alg.alg_string;
+ arg_count += !!ic->journal_mac_alg.alg_string;
+ DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start,
+ ic->tag_size, ic->mode, arg_count);
+ DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
+ DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
+ DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
+ DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage);
+ DMEMIT(" commit_time:%u", ic->autocommit_msec);
+ if (ic->sectors_per_block != 1)
+ DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
+
+ /* emit " name:alg" and, when a key string is present, ":key" */
+#define EMIT_ALG(a, n) \
+ do { \
+ if (ic->a.alg_string) { \
+ DMEMIT(" %s:%s", n, ic->a.alg_string); \
+ if (ic->a.key_string) \
+ DMEMIT(":%s", ic->a.key_string);\
+ } \
+ } while (0)
+ EMIT_ALG(internal_hash_alg, "internal_hash");
+ EMIT_ALG(journal_crypt_alg, "journal_crypt");
+ EMIT_ALG(journal_mac_alg, "journal_mac");
+ break;
+ }
+ }
+ }
+
+ /*
+  * Target iterate_devices hook: invoke @fn once for the underlying device.
+  *
+  * The start passed to @fn is ic->start plus the initial sectors plus one
+  * metadata run; the length is the target length. Returns whatever @fn
+  * returns.
+  */
+ static int dm_integrity_iterate_devices(struct dm_target *ti,
+ 					iterate_devices_callout_fn fn, void *data)
+ {
+ 	struct dm_integrity_c *ictx = ti->private;
+
+ 	return fn(ti, ictx->dev,
+ 		  ictx->start + ictx->initial_sectors + ictx->metadata_run,
+ 		  ti->len, data);
+ }
+
+ /*
+  * Target io_hints hook: when a block spans more than one sector, set the
+  * logical block size, the physical block size and the minimum I/O size in
+  * @limits to the block size in bytes. With single-sector blocks @limits is
+  * left untouched.
+  */
+ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *limits)
+ {
+ 	struct dm_integrity_c *ictx = ti->private;
+
+ 	if (ictx->sectors_per_block <= 1)
+ 		return;
+
+ 	limits->logical_block_size = ictx->sectors_per_block << SECTOR_SHIFT;
+ 	limits->physical_block_size = ictx->sectors_per_block << SECTOR_SHIFT;
+ 	blk_limits_io_min(limits, ictx->sectors_per_block << SECTOR_SHIFT);
+ }
+
+ /*
+  * Derive the journal geometry fields of @ic from the superblock, the block
+  * size and the tag size:
+  *
+  *   journal_sections           - taken from the superblock
+  *   journal_entry_size         - entry header up to last_bytes[sectors_per_block]
+  *                                plus the tag, rounded up to JOURNAL_ENTRY_ROUNDUP
+  *   journal_entries_per_sector - entries that fit into one sector's data area
+  *                                (reduced by the MAC space when the superblock
+  *                                has SB_FLAG_HAVE_JOURNAL_MAC)
+  *   journal_section_entries    - entries per section
+  *   journal_section_sectors    - data sectors for those entries plus
+  *                                JOURNAL_BLOCK_SECTORS
+  *   journal_entries            - entries in the whole journal
+  */
+ static void calculate_journal_section_size(struct dm_integrity_c *ic)
+ {
+ 	unsigned usable_bytes = JOURNAL_SECTOR_DATA;
+
+ 	if (ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC))
+ 		usable_bytes -= JOURNAL_MAC_PER_SECTOR;
+
+ 	ic->journal_sections = le32_to_cpu(ic->sb->journal_sections);
+
+ 	ic->journal_entry_size = roundup(offsetof(struct journal_entry, last_bytes[ic->sectors_per_block]) + ic->tag_size,
+ 					 JOURNAL_ENTRY_ROUNDUP);
+ 	ic->journal_entries_per_sector = usable_bytes / ic->journal_entry_size;
+
+ 	ic->journal_section_entries = ic->journal_entries_per_sector * JOURNAL_BLOCK_SECTORS;
+ 	ic->journal_section_sectors = (ic->journal_section_entries << ic->sb->log2_sectors_per_block) + JOURNAL_BLOCK_SECTORS;
+
+ 	ic->journal_entries = ic->journal_section_entries * ic->journal_sections;
+ }
+
+ /*
+ * Compute the layout values that depend on the device size
+ * (ic->initial_sectors, ic->metadata_run, ic->log2_metadata_run) and check
+ * that ic->provided_data_sectors fits on the device.
+ *
+ * Returns 0 when the layout fits, -EINVAL otherwise.
+ */
+ static int calculate_device_limits(struct dm_integrity_c *ic)
+ {
+ __u64 initial_sectors;
+ sector_t last_sector, last_area, last_offset;
+
+ calculate_journal_section_size(ic);
+ /* superblock plus the whole journal */
+ initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections;
+ if (initial_sectors + METADATA_PADDING_SECTORS >= ic->device_sectors || initial_sectors > UINT_MAX)
+ return -EINVAL;
+ ic->initial_sectors = initial_sectors;
+
+ ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
+ (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT;
+ /* remember the log2 when metadata_run is a power of two, -1 otherwise */
+ if (!(ic->metadata_run & (ic->metadata_run - 1)))
+ ic->log2_metadata_run = __ffs(ic->metadata_run);
+ else
+ ic->log2_metadata_run = -1;
+
+ /* locate the last provided data sector on the underlying device */
+ get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset);
+ last_sector = get_data_sector(ic, last_area, last_offset);
+
+ /* reject arithmetic wraparound and positions beyond the end of the device */
+ if (ic->start + last_sector < last_sector || ic->start + last_sector >= ic->device_sectors)
+ return -EINVAL;
+
+ return 0;
+ }
+
+ /*
+ * Fill in a fresh superblock in ic->sb.
+ *
+ * @journal_sectors:    requested journal size in sectors; at least one
+ *                      journal section is always used
+ * @interleave_sectors: requested interleave; 0 selects
+ *                      DEFAULT_INTERLEAVE_SECTORS
+ *
+ * Also determines the largest ic->provided_data_sectors that
+ * calculate_device_limits() accepts. Returns 0 on success, -EINVAL when not
+ * even the smallest tested size fits.
+ */
+ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors)
+ {
+ unsigned journal_sections;
+ int test_bit;
+
+ memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT);
+ memcpy(ic->sb->magic, SB_MAGIC, 8);
+ ic->sb->version = SB_VERSION;
+ ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size);
+ ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block);
+ if (ic->journal_mac_alg.alg_string)
+ ic->sb->flags |= cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC);
+
+ calculate_journal_section_size(ic);
+ journal_sections = journal_sectors / ic->journal_section_sectors;
+ if (!journal_sections)
+ journal_sections = 1;
+ ic->sb->journal_sections = cpu_to_le32(journal_sections);
+
+ if (!interleave_sectors)
+ interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
+ /* store log2 of the interleave (rounded down), clamped to the supported range */
+ ic->sb->log2_interleave_sectors = __fls(interleave_sectors);
+ ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
+ ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
+
+ /*
+ * Build the size bit by bit from the highest bit of the device size
+ * down to bit 3, keeping each bit only if the resulting layout still
+ * fits. Bits 0-2 are never set, so the result is a multiple of 8 sectors.
+ */
+ ic->provided_data_sectors = 0;
+ for (test_bit = fls64(ic->device_sectors) - 1; test_bit >= 3; test_bit--) {
+ __u64 prev_data_sectors = ic->provided_data_sectors;
+
+ ic->provided_data_sectors |= (sector_t)1 << test_bit;
+ if (calculate_device_limits(ic))
+ ic->provided_data_sectors = prev_data_sectors;
+ }
+
+ if (!ic->provided_data_sectors)
+ return -EINVAL;
+
+ ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors);
+
+ return 0;
+ }
+
+ /*
+  * Register a block integrity profile on the mapped device's gendisk.
+  *
+  * The profile uses dm_integrity_profile, a tuple size and tag size equal to
+  * the target's tag size, and an interval exponent matching the block size.
+  * The queue's maximum number of integrity segments is set to UINT_MAX.
+  */
+ static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic)
+ {
+ 	struct blk_integrity profile_desc;
+ 	struct gendisk *gd;
+
+ 	memset(&profile_desc, 0, sizeof(profile_desc));
+ 	profile_desc.profile = &dm_integrity_profile;
+ 	profile_desc.tuple_size = ic->tag_size;
+ 	profile_desc.tag_size = profile_desc.tuple_size;
+ 	profile_desc.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT;
+
+ 	gd = dm_disk(dm_table_get_md(ti->table));
+ 	blk_integrity_register(gd, &profile_desc);
+ 	blk_queue_max_integrity_segments(gd->queue, UINT_MAX);
+ }
+
+ /*
+  * Release a page list of ic->journal_pages descriptors: free every page that
+  * was allocated, then the descriptor array itself. A NULL @pl is accepted.
+  */
+ static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl)
+ {
+ 	unsigned idx;
+
+ 	if (pl) {
+ 		for (idx = 0; idx < ic->journal_pages; idx++) {
+ 			/* slots may be empty after a partial allocation */
+ 			if (!pl[idx].page)
+ 				continue;
+ 			__free_page(pl[idx].page);
+ 		}
+ 		kvfree(pl);
+ 	}
+ }
+
+ /*
+  * Allocate a zero-initialised array of ic->journal_pages page descriptors,
+  * give each one a freshly allocated page and chain the descriptors through
+  * their ->next pointers in array order.
+  *
+  * Returns the array, or NULL on allocation failure (anything allocated so
+  * far is released).
+  */
+ static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic)
+ {
+ 	size_t desc_bytes = ic->journal_pages * sizeof(struct page_list);
+ 	struct page_list *list;
+ 	unsigned idx;
+
+ 	list = kvmalloc(desc_bytes, GFP_KERNEL | __GFP_ZERO);
+ 	if (!list)
+ 		return NULL;
+
+ 	for (idx = 0; idx < ic->journal_pages; idx++) {
+ 		list[idx].page = alloc_page(GFP_KERNEL);
+ 		if (!list[idx].page)
+ 			goto fail;
+ 		/* link the previous descriptor to this one */
+ 		if (idx != 0)
+ 			list[idx - 1].next = &list[idx];
+ 	}
+
+ 	return list;
+
+ fail:
+ 	dm_integrity_free_page_list(ic, list);
+ 	return NULL;
+ }
+
+ /*
+  * Free a per-section scatterlist array as built by
+  * dm_integrity_alloc_journal_scatterlist().
+  *
+  * @sl has ic->journal_sections slots. Both the individual scatterlists and
+  * the pointer array are obtained from kvmalloc(), which may fall back to
+  * vmalloc memory, so both must be released with kvfree(); kfree() on the
+  * array would be wrong whenever that fallback was taken.
+  */
+ static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, struct scatterlist **sl)
+ {
+ 	unsigned i;
+
+ 	for (i = 0; i < ic->journal_sections; i++)
+ 		kvfree(sl[i]);
+ 	kvfree(sl);
+ }
+
+ /*
+ * Build one scatterlist per journal section over the pages of @pl.
+ *
+ * For every section the first and last page holding it are looked up with
+ * page_list_location(); the scatterlist gets one entry per page in that
+ * range, with the first entry starting at the section's offset in its page
+ * and the last entry ending one sector past the section's last sector.
+ *
+ * Returns the array of ic->journal_sections scatterlists, or NULL on
+ * allocation failure.
+ */
+ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl)
+ {
+ struct scatterlist **sl;
+ unsigned i;
+
+ /* zeroed so that a partially filled array can be freed safely */
+ sl = kvmalloc(ic->journal_sections * sizeof(struct scatterlist *), GFP_KERNEL | __GFP_ZERO);
+ if (!sl)
+ return NULL;
+
+ for (i = 0; i < ic->journal_sections; i++) {
+ struct scatterlist *s;
+ unsigned start_index, start_offset;
+ unsigned end_index, end_offset;
+ unsigned n_pages;
+ unsigned idx;
+
+ page_list_location(ic, i, 0, &start_index, &start_offset);
+ page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset);
+
+ n_pages = (end_index - start_index + 1);
+
+ s = kvmalloc(n_pages * sizeof(struct scatterlist), GFP_KERNEL);
+ if (!s) {
+ dm_integrity_free_journal_scatterlist(ic, sl);
+ return NULL;
+ }
+
+ sg_init_table(s, n_pages);
+ for (idx = start_index; idx <= end_index; idx++) {
+ char *va = lowmem_page_address(pl[idx].page);
+ unsigned start = 0, end = PAGE_SIZE;
+ /* only the first and the last page may be covered partially */
+ if (idx == start_index)
+ start = start_offset;
+ if (idx == end_index)
+ end = end_offset + (1 << SECTOR_SHIFT);
+ sg_set_buf(&s[idx - start_index], va + start, end - start);
+ }
+
+ sl[i] = s;
+ }
+
+ return sl;
+ }
+
+ /*
+  * Release the strings owned by an algorithm specification and reset the
+  * whole structure to zero. Both buffers are freed with kzfree().
+  */
+ static void free_alg(struct alg_spec *a)
+ {
+ 	kzfree(a->key);
+ 	kzfree(a->alg_string);
+
+ 	memset(a, 0, sizeof(*a));
+ }
+
+ /*
+ * Parse an optional argument of the form "<name>:<alg>[:<hex key>]" into @a.
+ *
+ * @arg:         the argument; everything after the first ':' is used
+ * @a:           specification to fill; its previous contents are freed first
+ * @error:       receives an error message on failure
+ * @error_inval: message to use when the key is not valid hex
+ *
+ * Returns 0 on success, -EINVAL for an odd-length or non-hex key, -ENOMEM on
+ * allocation failure. On failure @a may be left partially filled.
+ *
+ * NOTE(review): the strchr(arg, ':') result is used without a NULL check;
+ * this assumes callers pass only arguments that contain ':' - confirm.
+ */
+ static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, char *error_inval)
+ {
+ char *k;
+
+ free_alg(a);
+
+ a->alg_string = kstrdup(strchr(arg, ':') + 1, GFP_KERNEL);
+ if (!a->alg_string)
+ goto nomem;
+
+ k = strchr(a->alg_string, ':');
+ if (k) {
+ /* split in place: key_string points into the alg_string buffer */
+ *k = 0;
+ a->key_string = k + 1;
+ if (strlen(a->key_string) & 1)
+ goto inval;
+
+ /* two hex digits per key byte */
+ a->key_size = strlen(a->key_string) / 2;
+ a->key = kmalloc(a->key_size, GFP_KERNEL);
+ if (!a->key)
+ goto nomem;
+ if (hex2bin(a->key, a->key_string, a->key_size))
+ goto inval;
+ }
+
+ return 0;
+inval:
+ *error = error_inval;
+ return -EINVAL;
+nomem:
+ *error = "Out of memory for an argument";
+ return -ENOMEM;
+ }
+
+ /*
+  * Allocate the shash transform described by @a and, if a key was given,
+  * set it.
+  *
+  * @hash:      receives the transform; set to NULL when allocation fails and
+  *             left untouched when @a names no algorithm
+  * @error:     receives @error_alg or @error_key on failure
+  *
+  * Returns 0 on success (including "no algorithm requested") or the negative
+  * error from the crypto API. When setting the key fails, the transform stays
+  * in *@hash.
+  */
+ static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error,
+ 		   char *error_alg, char *error_key)
+ {
+ 	struct crypto_shash *tfm;
+ 	int r;
+
+ 	if (!a->alg_string)
+ 		return 0;
+
+ 	tfm = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ASYNC);
+ 	if (IS_ERR(tfm)) {
+ 		*error = error_alg;
+ 		*hash = NULL;
+ 		return PTR_ERR(tfm);
+ 	}
+ 	*hash = tfm;
+
+ 	if (!a->key)
+ 		return 0;
+
+ 	r = crypto_shash_setkey(tfm, a->key, a->key_size);
+ 	if (r)
+ 		*error = error_key;
+ 	return r;
+ }
+
+ /*
+ * Allocate the in-memory journal and set up optional journal encryption.
+ *
+ * Steps:
+ *  - initialise the four commit ids to fixed patterns;
+ *  - allocate the page list that holds the journal;
+ *  - when a journal cipher is configured, allocate the I/O page list and
+ *    prepare the cipher: for a cipher with block size 1 an xor page list is
+ *    produced by running the cipher over zeroed pages followed by the commit
+ *    ids; otherwise per-section scatterlists and skcipher requests are
+ *    created, each with an IV obtained by running the cipher over the
+ *    section number;
+ *  - make sure the commit ids are pairwise distinct;
+ *  - allocate the journal tree.
+ *
+ * Returns 0 on success or a negative errno with *@error set. Nothing
+ * allocated into @ic is released here on failure; presumably the caller's
+ * error path does that - confirm.
+ */
+ static int create_journal(struct dm_integrity_c *ic, char **error)
+ {
+ int r = 0;
+ unsigned i;
+ __u64 journal_pages, journal_desc_size, journal_tree_size;
+ unsigned char *crypt_data = NULL;
+
+ ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL);
+ ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL);
+ ic->commit_ids[2] = cpu_to_le64(0x3333333333333333ULL);
+ ic->commit_ids[3] = cpu_to_le64(0x4444444444444444ULL);
+
+ /* number of pages needed to hold all journal sectors, rounded up */
+ journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors,
+ PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT);
+ journal_desc_size = journal_pages * sizeof(struct page_list);
+ if (journal_pages >= totalram_pages - totalhigh_pages || journal_desc_size > ULONG_MAX) {
+ *error = "Journal doesn't fit into memory";
+ r = -ENOMEM;
+ goto bad;
+ }
+ ic->journal_pages = journal_pages;
+
+ ic->journal = dm_integrity_alloc_page_list(ic);
+ if (!ic->journal) {
+ *error = "Could not allocate memory for journal";
+ r = -ENOMEM;
+ goto bad;
+ }
+ if (ic->journal_crypt_alg.alg_string) {
+ unsigned ivsize, blocksize;
+ struct journal_completion comp;
+
+ comp.ic = ic;
+ ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, 0);
+ if (IS_ERR(ic->journal_crypt)) {
+ *error = "Invalid journal cipher";
+ r = PTR_ERR(ic->journal_crypt);
+ ic->journal_crypt = NULL;
+ goto bad;
+ }
+ ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
+ blocksize = crypto_skcipher_blocksize(ic->journal_crypt);
+
+ if (ic->journal_crypt_alg.key) {
+ r = crypto_skcipher_setkey(ic->journal_crypt, ic->journal_crypt_alg.key,
+ ic->journal_crypt_alg.key_size);
+ if (r) {
+ *error = "Error setting encryption key";
+ goto bad;
+ }
+ }
+ DEBUG_print("cipher %s, block size %u iv size %u\n",
+ ic->journal_crypt_alg.alg_string, blocksize, ivsize);
+
+ ic->journal_io = dm_integrity_alloc_page_list(ic);
+ if (!ic->journal_io) {
+ *error = "Could not allocate memory for journal io";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ if (blocksize == 1) {
+ /*
+ * Block size 1: run the cipher once, in place, over zeroed
+ * journal_xor pages followed by the commit ids. The cipher
+ * handle is freed afterwards.
+ */
+ struct scatterlist *sg;
+ SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt);
+ unsigned char iv[ivsize];
+ skcipher_request_set_tfm(req, ic->journal_crypt);
+
+ ic->journal_xor = dm_integrity_alloc_page_list(ic);
+ if (!ic->journal_xor) {
+ *error = "Could not allocate memory for journal xor";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ /* one entry per xor page plus a final one for the commit ids */
+ sg = kvmalloc((ic->journal_pages + 1) * sizeof(struct scatterlist), GFP_KERNEL);
+ if (!sg) {
+ *error = "Unable to allocate sg list";
+ r = -ENOMEM;
+ goto bad;
+ }
+ sg_init_table(sg, ic->journal_pages + 1);
+ for (i = 0; i < ic->journal_pages; i++) {
+ char *va = lowmem_page_address(ic->journal_xor[i].page);
+ clear_page(va);
+ sg_set_buf(&sg[i], va, PAGE_SIZE);
+ }
+ sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids);
+ memset(iv, 0x00, ivsize);
+
+ skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, iv);
+ comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp);
+ comp.in_flight = (atomic_t)ATOMIC_INIT(1);
+ /* presumably do_crypt() returns non-zero when completion must be awaited - confirm */
+ if (do_crypt(true, req, &comp))
+ wait_for_completion(&comp.comp);
+ kvfree(sg);
+ r = dm_integrity_failed(ic);
+ if (r) {
+ *error = "Unable to encrypt journal";
+ goto bad;
+ }
+ DEBUG_bytes(lowmem_page_address(ic->journal_xor[0].page), 64, "xor data");
+
+ crypto_free_skcipher(ic->journal_crypt);
+ ic->journal_crypt = NULL;
+ } else {
+ /*
+ * Larger block size: prepare one skcipher request per journal
+ * section, each with its own IV.
+ */
+ SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt);
+ unsigned char iv[ivsize];
+ unsigned crypt_len = roundup(ivsize, blocksize);
+
+ crypt_data = kmalloc(crypt_len, GFP_KERNEL);
+ if (!crypt_data) {
+ *error = "Unable to allocate crypt data";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ skcipher_request_set_tfm(req, ic->journal_crypt);
+
+ ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal);
+ if (!ic->journal_scatterlist) {
+ *error = "Unable to allocate sg list";
+ r = -ENOMEM;
+ goto bad;
+ }
+ ic->journal_io_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal_io);
+ if (!ic->journal_io_scatterlist) {
+ *error = "Unable to allocate sg list";
+ r = -ENOMEM;
+ goto bad;
+ }
+ ic->sk_requests = kvmalloc(ic->journal_sections * sizeof(struct skcipher_request *), GFP_KERNEL | __GFP_ZERO);
+ if (!ic->sk_requests) {
+ *error = "Unable to allocate sk requests";
+ r = -ENOMEM;
+ goto bad;
+ }
+ for (i = 0; i < ic->journal_sections; i++) {
+ struct scatterlist sg;
+ struct skcipher_request *section_req;
+ __u32 section_le = cpu_to_le32(i);
+
+ /* IV source: the little-endian section number, zero padded, run through the cipher with a zero IV */
+ memset(iv, 0x00, ivsize);
+ memset(crypt_data, 0x00, crypt_len);
+ memcpy(crypt_data, &section_le, min((size_t)crypt_len, sizeof(section_le)));
+
+ sg_init_one(&sg, crypt_data, crypt_len);
+ skcipher_request_set_crypt(req, &sg, &sg, crypt_len, iv);
+ comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp);
+ comp.in_flight = (atomic_t)ATOMIC_INIT(1);
+ if (do_crypt(true, req, &comp))
+ wait_for_completion(&comp.comp);
+
+ r = dm_integrity_failed(ic);
+ if (r) {
+ *error = "Unable to generate iv";
+ goto bad;
+ }
+
+ section_req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
+ if (!section_req) {
+ *error = "Unable to allocate crypt request";
+ r = -ENOMEM;
+ goto bad;
+ }
+ /* buffer of 2 * ivsize; the generated IV is stored in its second half */
+ section_req->iv = kmalloc(ivsize * 2, GFP_KERNEL);
+ if (!section_req->iv) {
+ skcipher_request_free(section_req);
+ *error = "Unable to allocate iv";
+ r = -ENOMEM;
+ goto bad;
+ }
+ memcpy(section_req->iv + ivsize, crypt_data, ivsize);
+ section_req->cryptlen = (size_t)ic->journal_section_sectors << SECTOR_SHIFT;
+ ic->sk_requests[i] = section_req;
+ DEBUG_bytes(crypt_data, ivsize, "iv(%u)", i);
+ }
+ }
+ }
+
+ /* make the commit ids pairwise distinct: bump an id by one until it differs from all earlier ones */
+ for (i = 0; i < N_COMMIT_IDS; i++) {
+ unsigned j;
+retest_commit_id:
+ for (j = 0; j < i; j++) {
+ if (ic->commit_ids[j] == ic->commit_ids[i]) {
+ ic->commit_ids[i] = cpu_to_le64(le64_to_cpu(ic->commit_ids[i]) + 1);
+ goto retest_commit_id;
+ }
+ }
+ DEBUG_print("commit id %u: %016llx\n", i, ic->commit_ids[i]);
+ }
+
+ journal_tree_size = (__u64)ic->journal_entries * sizeof(struct journal_node);
+ if (journal_tree_size > ULONG_MAX) {
+ *error = "Journal doesn't fit into memory";
+ r = -ENOMEM;
+ goto bad;
+ }
+ ic->journal_tree = kvmalloc(journal_tree_size, GFP_KERNEL);
+ if (!ic->journal_tree) {
+ *error = "Could not allocate memory for journal tree";
+ r = -ENOMEM;
+ }
+bad:
+ /* reached on success as well; crypt_data is only a scratch buffer */
+ kfree(crypt_data);
+ return r;
+ }
+
+/*
+ * Construct an integrity mapping
+ *
+ * Arguments:
+ * device
+ * offset from the start of the device
+ * tag size
+ * D - direct writes, J - journal writes, R - recovery mode
+ * number of optional arguments
+ * optional arguments:
+ * journal_sectors
+ * interleave_sectors
+ * buffer_sectors
+ * journal_watermark
+ * commit_time
+ * internal_hash
+ * journal_crypt
+ * journal_mac
+ * block_size
+ */
+static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ struct dm_integrity_c *ic;
+ char dummy;
+ int r;
+ unsigned extra_args;
+ struct dm_arg_set as;
+ static struct dm_arg _args[] = {
+ {0, 9, "Invalid number of feature args"},
+ };
+ unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
+ bool should_write_sb;
+ __u64 threshold;
+ unsigned long long start;
+
+#define DIRECT_ARGUMENTS 4
+
+ if (argc <= DIRECT_ARGUMENTS) {
+ ti->error = "Invalid argument count";
+ return -EINVAL;
+ }
+
+ ic = kzalloc(sizeof(struct dm_integrity_c), GFP_KERNEL);
+ if (!ic) {
+ ti->error = "Cannot allocate integrity context";
+ return -ENOMEM;
+ }
+ ti->private = ic;
+ ti->per_io_data_size = sizeof(struct dm_integrity_io);
+
+ ic->in_progress = RB_ROOT;
+ init_waitqueue_head(&ic->endio_wait);
+ bio_list_init(&ic->flush_bio_list);
+ init_waitqueue_head(&ic->copy_to_journal_wait);
+ init_completion(&ic->crypto_backoff);
+
+ r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev);
+ if (r) {
+ ti->error = "Device lookup failed";
+ goto bad;
+ }
+
+ if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) {
+ ti->error = "Invalid starting offset";
+ r = -EINVAL;
+ goto bad;
+ }
+ ic->start = start;
+
+ if (strcmp(argv[2], "-")) {
+ if (sscanf(argv[2], "%u%c", &ic->tag_size, &dummy) != 1 || !ic->tag_size) {
+ ti->error = "Invalid tag size";
+ r = -EINVAL;
+ goto bad;
+ }
+ }
+
+ if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R"))
+ ic->mode = argv[3][0];
+ else {
+ ti->error = "Invalid mode (expecting J, D, R)";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ ic->device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT;
+ journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
+ ic->device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
+ interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
+ buffer_sectors = DEFAULT_BUFFER_SECTORS;
+ journal_watermark = DEFAULT_JOURNAL_WATERMARK;
+ sync_msec = DEFAULT_SYNC_MSEC;
+ ic->sectors_per_block = 1;
+
+ as.argc = argc - DIRECT_ARGUMENTS;
+ as.argv = argv + DIRECT_ARGUMENTS;
+ r = dm_read_arg_group(_args, &as, &extra_args, &ti->error);
+ if (r)
+ goto bad;
+
+ while (extra_args--) {
+ const char *opt_string;
+ unsigned val;
+ opt_string = dm_shift_arg(&as);
+ if (!opt_string) {
+ r = -EINVAL;
+ ti->error = "Not enough feature arguments";
+ goto bad;
+ }
+ if (sscanf(opt_string, "journal_sectors:%u%c", &val, &dummy) == 1)
+ journal_sectors = val;
+ else if (sscanf(opt_string, "interleave_sectors:%u%c", &val, &dummy) == 1)
+ interleave_sectors = val;
+ else if (sscanf(opt_string, "buffer_sectors:%u%c", &val, &dummy) == 1)
+ buffer_sectors = val;
+ else if (sscanf(opt_string, "journal_watermark:%u%c", &val, &dummy) == 1 && val <= 100)
+ journal_watermark = val;
+ else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1)
+ sync_msec = val;
+ else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) {
+ if (val < 1 << SECTOR_SHIFT ||
+ val > MAX_SECTORS_PER_BLOCK << SECTOR_SHIFT ||
+ (val & (val -1))) {
+ r = -EINVAL;
+ ti->error = "Invalid block_size argument";
+ goto bad;
+ }
+ ic->sectors_per_block = val >> SECTOR_SHIFT;
+ } else if (!memcmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
+ r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error,
+ "Invalid internal_hash argument");
+ if (r)
+ goto bad;
+ } else if (!memcmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) {
+ r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error,
+ "Invalid journal_crypt argument");
+ if (r)
+ goto bad;
+ } else if (!memcmp(opt_string, "journal_mac:", strlen("journal_mac:"))) {
+ r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error,
+ "Invalid journal_mac argument");
+ if (r)
+ goto bad;
+ } else {
+ r = -EINVAL;
+ ti->error = "Invalid argument";
+ goto bad;
+ }
+ }
+
+ r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error,
+ "Invalid internal hash", "Error setting internal hash key");
+ if (r)
+ goto bad;
+
+ r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error,
+ "Invalid journal mac", "Error setting journal mac key");
+ if (r)
+ goto bad;
+
+ if (!ic->tag_size) {
+ if (!ic->internal_hash) {
+ ti->error = "Unknown tag size";
+ r = -EINVAL;
+ goto bad;
+ }
+ ic->tag_size = crypto_shash_digestsize(ic->internal_hash);
+ }
+ if (ic->tag_size > MAX_TAG_SIZE) {
+ ti->error = "Too big tag size";
+ r = -EINVAL;
+ goto bad;
+ }
+ if (!(ic->tag_size & (ic->tag_size - 1)))
+ ic->log2_tag_size = __ffs(ic->tag_size);
+ else
+ ic->log2_tag_size = -1;
+
+ ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
+ ic->autocommit_msec = sync_msec;
+ setup_timer(&ic->autocommit_timer, autocommit_fn, (unsigned long)ic);
+
+ ic->io = dm_io_client_create();
+ if (IS_ERR(ic->io)) {
+ r = PTR_ERR(ic->io);
+ ic->io = NULL;
+ ti->error = "Cannot allocate dm io";
+ goto bad;
+ }
+
+ ic->journal_io_mempool = mempool_create_slab_pool(JOURNAL_IO_MEMPOOL, journal_io_cache);
+ if (!ic->journal_io_mempool) {
+ r = -ENOMEM;
+ ti->error = "Cannot allocate mempool";
+ goto bad;
+ }
+
+ ic->metadata_wq = alloc_workqueue("dm-integrity-metadata",
+ WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE);
+ if (!ic->metadata_wq) {
+ ti->error = "Cannot allocate workqueue";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ /*
+ * If this workqueue were percpu, it would cause bio reordering
+ * and reduced performance.
+ */
+ ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ if (!ic->wait_wq) {
+ ti->error = "Cannot allocate workqueue";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1);
+ if (!ic->commit_wq) {
+ ti->error = "Cannot allocate workqueue";
+ r = -ENOMEM;
+ goto bad;
+ }
+ INIT_WORK(&ic->commit_work, integrity_commit);
+
+ if (ic->mode == 'J') {
+ ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1);
+ if (!ic->writer_wq) {
+ ti->error = "Cannot allocate workqueue";
+ r = -ENOMEM;
+ goto bad;
+ }
+ INIT_WORK(&ic->writer_work, integrity_writer);
+ }
+
+ ic->sb = alloc_pages_exact(SB_SECTORS << SECTOR_SHIFT, GFP_KERNEL);
+ if (!ic->sb) {
+ r = -ENOMEM;
+ ti->error = "Cannot allocate superblock area";
+ goto bad;
+ }
+
+ r = sync_rw_sb(ic, REQ_OP_READ, 0);
+ if (r) {
+ ti->error = "Error reading superblock";
+ goto bad;
+ }
+ should_write_sb = false;
+ if (memcmp(ic->sb->magic, SB_MAGIC, 8)) {
+ if (ic->mode != 'R') {
+ if (memchr_inv(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT)) {
+ r = -EINVAL;
+ ti->error = "The device is not initialized";
+ goto bad;
+ }
+ }
+
+ r = initialize_superblock(ic, journal_sectors, interleave_sectors);
+ if (r) {
+ ti->error = "Could not initialize superblock";
+ goto bad;
+ }
+ if (ic->mode != 'R')
+ should_write_sb = true;
+ }
+
+ if (ic->sb->version != SB_VERSION) {
+ r = -EINVAL;
+ ti->error = "Unknown version";
+ goto bad;
+ }
+ if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) {
+ r = -EINVAL;
+ ti->error = "Tag size doesn't match the information in superblock";
+ goto bad;
+ }
+ if (ic->sb->log2_sectors_per_block != __ffs(ic->sectors_per_block)) {
+ r = -EINVAL;
+ ti->error = "Block size doesn't match the information in superblock";
+ goto bad;
+ }
+ /* make sure that ti->max_io_len doesn't overflow */
+ if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS ||
+ ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) {
+ r = -EINVAL;
+ ti->error = "Invalid interleave_sectors in the superblock";
+ goto bad;
+ }
+ ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors);
+ if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) {
+ /* test for overflow */
+ r = -EINVAL;
+ ti->error = "The superblock has 64-bit device size, but the kernel was compiled with 32-bit sectors";
+ goto bad;
+ }
+ if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) {
+ r = -EINVAL;
+ ti->error = "Journal mac mismatch";
+ goto bad;
+ }
+ r = calculate_device_limits(ic);
+ if (r) {
+ ti->error = "The device is too small";
+ goto bad;
+ }
+ if (ti->len > ic->provided_data_sectors) {
+ r = -EINVAL;
+ ti->error = "Not enough provided sectors for requested mapping size";
+ goto bad;
+ }
+
+ if (!buffer_sectors)
+ buffer_sectors = 1;
+ ic->log2_buffer_sectors = min3((int)__fls(buffer_sectors), (int)__ffs(ic->metadata_run), 31 - SECTOR_SHIFT);
+
+ threshold = (__u64)ic->journal_entries * (100 - journal_watermark);
+ threshold += 50;
+ do_div(threshold, 100);
+ ic->free_sectors_threshold = threshold;
+
+ DEBUG_print("initialized:\n");
+ DEBUG_print(" integrity_tag_size %u\n", le16_to_cpu(ic->sb->integrity_tag_size));
+ DEBUG_print(" journal_entry_size %u\n", ic->journal_entry_size);
+ DEBUG_print(" journal_entries_per_sector %u\n", ic->journal_entries_per_sector);
+ DEBUG_print(" journal_section_entries %u\n", ic->journal_section_entries);
+ DEBUG_print(" journal_section_sectors %u\n", ic->journal_section_sectors);
+ DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections));
+ DEBUG_print(" journal_entries %u\n", ic->journal_entries);
+ DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors);
+ DEBUG_print(" device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors);
+ DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors);
+ DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run);
+ DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run);
+ DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors,
+ (unsigned long long)ic->provided_data_sectors);
+ DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
+
+ ic->bufio = dm_bufio_client_create(ic->dev->bdev, 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors),
+ 1, 0, NULL, NULL);
+ if (IS_ERR(ic->bufio)) {
+ r = PTR_ERR(ic->bufio);
+ ti->error = "Cannot initialize dm-bufio";
+ ic->bufio = NULL;
+ goto bad;
+ }
+ dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors);
+
+ if (ic->mode != 'R') {
+ r = create_journal(ic, &ti->error);
+ if (r)
+ goto bad;
+ }
+
+ if (should_write_sb) {
+ int r;
+
+ init_journal(ic, 0, ic->journal_sections, 0);
+ r = dm_integrity_failed(ic);
+ if (unlikely(r)) {
+ ti->error = "Error initializing journal";
+ goto bad;
+ }
+ r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+ if (r) {
+ ti->error = "Error initializing superblock";
+ goto bad;
+ }
+ ic->just_formatted = true;
+ }
+
+ r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors);
+ if (r)
+ goto bad;
+
+ if (!ic->internal_hash)
+ dm_integrity_set(ti, ic);
+
+ ti->num_flush_bios = 1;
+ ti->flush_supported = true;
+
+ return 0;
+bad:
+ dm_integrity_dtr(ti);
+ return r;
+}
+
+/*
+ * Destroy an integrity mapping and free its context.
+ *
+ * Besides being the target's .dtr hook, this is what dm_integrity_ctr() calls
+ * on its error path, so @ti->private may be only partially set up.
+ */
+static void dm_integrity_dtr(struct dm_target *ti)
+{
+	struct dm_integrity_c *ic = ti->private;
+
+	/* The in_progress tree must be empty by the time we tear down. */
+	BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
+
+	/* Workqueues. */
+	if (ic->metadata_wq)
+		destroy_workqueue(ic->metadata_wq);
+	if (ic->wait_wq)
+		destroy_workqueue(ic->wait_wq);
+	if (ic->commit_wq)
+		destroy_workqueue(ic->commit_wq);
+	if (ic->writer_wq)
+		destroy_workqueue(ic->writer_wq);
+
+	/* I/O clients, the journal_io mempool and the underlying device. */
+	if (ic->bufio)
+		dm_bufio_client_destroy(ic->bufio);
+	mempool_destroy(ic->journal_io_mempool);
+	if (ic->io)
+		dm_io_client_destroy(ic->io);
+	if (ic->dev)
+		dm_put_device(ti, ic->dev);
+
+	/* Journal pages and the scatterlists describing them. */
+	dm_integrity_free_page_list(ic, ic->journal);
+	dm_integrity_free_page_list(ic, ic->journal_io);
+	dm_integrity_free_page_list(ic, ic->journal_xor);
+	if (ic->journal_scatterlist)
+		dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist);
+	if (ic->journal_io_scatterlist)
+		dm_integrity_free_journal_scatterlist(ic, ic->journal_io_scatterlist);
+
+	/* Per-section cipher requests; each one owns a separately allocated iv. */
+	if (ic->sk_requests) {
+		unsigned section;
+
+		for (section = 0; section < ic->journal_sections; section++) {
+			struct skcipher_request *sk_req = ic->sk_requests[section];
+
+			if (!sk_req)
+				continue;
+			kzfree(sk_req->iv);
+			skcipher_request_free(sk_req);
+		}
+		kvfree(ic->sk_requests);
+	}
+	kvfree(ic->journal_tree);
+	if (ic->sb)
+		free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT);
+
+	/* Crypto transforms and their algorithm/key descriptors. */
+	if (ic->internal_hash)
+		crypto_free_shash(ic->internal_hash);
+	free_alg(&ic->internal_hash_alg);
+
+	if (ic->journal_crypt)
+		crypto_free_skcipher(ic->journal_crypt);
+	free_alg(&ic->journal_crypt_alg);
+
+	if (ic->journal_mac)
+		crypto_free_shash(ic->journal_mac);
+	free_alg(&ic->journal_mac_alg);
+
+	kfree(ic);
+}
+
+/*
+ * Target type descriptor for the "integrity" target.  It is registered by
+ * dm_integrity_init() and unregistered by dm_integrity_exit().
+ */
+static struct target_type integrity_target = {
+ .name = "integrity",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
+ .ctr = dm_integrity_ctr,
+ .dtr = dm_integrity_dtr,
+ .map = dm_integrity_map,
+ .postsuspend = dm_integrity_postsuspend,
+ .resume = dm_integrity_resume,
+ .status = dm_integrity_status,
+ .iterate_devices = dm_integrity_iterate_devices,
+ .io_hints = dm_integrity_io_hints,
+};
+
+/*
+ * Module initialisation: create the slab cache used for struct journal_io
+ * and register the "integrity" target type.
+ *
+ * Returns 0 on success, -ENOMEM if the cache cannot be created, or the
+ * negative error from dm_register_target().
+ */
+int __init dm_integrity_init(void)
+{
+	int r;
+
+	journal_io_cache = kmem_cache_create("integrity_journal_io",
+					     sizeof(struct journal_io), 0, 0, NULL);
+	if (!journal_io_cache) {
+		DMERR("can't allocate journal io cache");
+		return -ENOMEM;
+	}
+
+	r = dm_register_target(&integrity_target);
+	if (r < 0) {
+		DMERR("register failed %d", r);
+		/* dm_integrity_exit() will not run; don't leak the cache. */
+		kmem_cache_destroy(journal_io_cache);
+	}
+
+	return r;
+}
+
+/*
+ * Module teardown: unregister the "integrity" target type, then destroy the
+ * journal_io slab cache created by dm_integrity_init().
+ */
+void dm_integrity_exit(void)
+{
+ dm_unregister_target(&integrity_target);
+ kmem_cache_destroy(journal_io_cache);
+}
+
+module_init(dm_integrity_init);
+module_exit(dm_integrity_exit);
+
+MODULE_AUTHOR("Milan Broz");
+MODULE_AUTHOR("Mikulas Patocka");
+MODULE_DESCRIPTION(DM_NAME " target for integrity tags extension");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 03940bf36f6c..25039607f3cb 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -58,7 +58,8 @@ struct dm_io_client *dm_io_client_create(void)
if (!client->pool)
goto bad;
- client->bios = bioset_create(min_ios, 0);
+ client->bios = bioset_create(min_ios, 0, (BIOSET_NEED_BVECS |
+ BIOSET_NEED_RESCUER));
if (!client->bios)
goto bad;
@@ -124,7 +125,7 @@ static void complete_io(struct io *io)
fn(error_bits, context);
}
-static void dec_count(struct io *io, unsigned int region, int error)
+static void dec_count(struct io *io, unsigned int region, blk_status_t error)
{
if (error)
set_bit(region, &io->error_bits);
@@ -137,9 +138,9 @@ static void endio(struct bio *bio)
{
struct io *io;
unsigned region;
- int error;
+ blk_status_t error;
- if (bio->bi_error && bio_data_dir(bio) == READ)
+ if (bio->bi_status && bio_data_dir(bio) == READ)
zero_fill_bio(bio);
/*
@@ -147,7 +148,7 @@ static void endio(struct bio *bio)
*/
retrieve_io_and_region_from_bio(bio, &io, &region);
- error = bio->bi_error;
+ error = bio->bi_status;
bio_put(bio);
dec_count(io, region, error);
@@ -312,11 +313,14 @@ static void do_region(int op, int op_flags, unsigned region,
*/
if (op == REQ_OP_DISCARD)
special_cmd_max_sectors = q->limits.max_discard_sectors;
+ else if (op == REQ_OP_WRITE_ZEROES)
+ special_cmd_max_sectors = q->limits.max_write_zeroes_sectors;
else if (op == REQ_OP_WRITE_SAME)
special_cmd_max_sectors = q->limits.max_write_same_sectors;
- if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_SAME) &&
- special_cmd_max_sectors == 0) {
- dec_count(io, region, -EOPNOTSUPP);
+ if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES ||
+ op == REQ_OP_WRITE_SAME) && special_cmd_max_sectors == 0) {
+ atomic_inc(&io->count);
+ dec_count(io, region, BLK_STS_NOTSUPP);
return;
}
@@ -328,11 +332,18 @@ static void do_region(int op, int op_flags, unsigned region,
/*
* Allocate a suitably sized-bio.
*/
- if ((op == REQ_OP_DISCARD) || (op == REQ_OP_WRITE_SAME))
+ switch (op) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_WRITE_ZEROES:
+ num_bvecs = 0;
+ break;
+ case REQ_OP_WRITE_SAME:
num_bvecs = 1;
- else
+ break;
+ default:
num_bvecs = min_t(int, BIO_MAX_PAGES,
dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
+ }
bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
@@ -341,7 +352,7 @@ static void do_region(int op, int op_flags, unsigned region,
bio_set_op_attrs(bio, op, op_flags);
store_io_and_region_in_bio(bio, io, region);
- if (op == REQ_OP_DISCARD) {
+ if (op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) {
num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
remaining -= num_sectors;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index a5a9b17f0f7f..e06f0ef7d2ec 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/miscdevice.h>
+#include <linux/sched/mm.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/slab.h>
@@ -22,6 +23,14 @@
#define DM_MSG_PREFIX "ioctl"
#define DM_DRIVER_EMAIL "[email protected]"
+/*
+ * Per-open-file state of the control device.  It is allocated in dm_open(),
+ * stored in filp->private_data and freed in dm_release().
+ */
+struct dm_file {
+ /*
+ * poll will wait until the global event number is greater than
+ * this value.
+ */
+ volatile unsigned global_event_nr;
+};
+
/*-----------------------------------------------------------------
* The ioctl interface needs to be able to look up devices by
* name or uuid.
@@ -36,14 +45,6 @@ struct hash_cell {
struct dm_table *new_map;
};
-/*
- * A dummy definition to make RCU happy.
- * struct dm_table should never be dereferenced in this file.
- */
-struct dm_table {
- int undefined__;
-};
-
struct vers_iter {
size_t param_size;
struct dm_target_versions *vers, *old_vers;
@@ -463,9 +464,9 @@ void dm_deferred_remove(void)
* All the ioctl commands get dispatched to functions with this
* prototype.
*/
-typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
+typedef int (*ioctl_fn)(struct file *filp, struct dm_ioctl *param, size_t param_size);
-static int remove_all(struct dm_ioctl *param, size_t param_size)
+static int remove_all(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false);
param->data_size = 0;
@@ -498,13 +499,14 @@ static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
return ((void *) param) + param->data_start;
}
-static int list_devices(struct dm_ioctl *param, size_t param_size)
+static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
unsigned int i;
struct hash_cell *hc;
size_t len, needed = 0;
struct gendisk *disk;
struct dm_name_list *nl, *old_nl = NULL;
+ uint32_t *event_nr;
down_write(&_hash_lock);
@@ -517,6 +519,7 @@ static int list_devices(struct dm_ioctl *param, size_t param_size)
needed += sizeof(struct dm_name_list);
needed += strlen(hc->name) + 1;
needed += ALIGN_MASK;
+ needed += (sizeof(uint32_t) + ALIGN_MASK) & ~ALIGN_MASK;
}
}
@@ -546,7 +549,9 @@ static int list_devices(struct dm_ioctl *param, size_t param_size)
strcpy(nl->name, hc->name);
old_nl = nl;
- nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1);
+ event_nr = align_ptr(((void *) (nl + 1)) + strlen(hc->name) + 1);
+ *event_nr = dm_get_event_nr(hc->md);
+ nl = align_ptr(event_nr + 1);
}
}
@@ -589,7 +594,7 @@ static void list_version_get_info(struct target_type *tt, void *param)
info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1);
}
-static int list_versions(struct dm_ioctl *param, size_t param_size)
+static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
size_t len, needed = 0;
struct dm_target_versions *vers;
@@ -731,7 +736,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
}
}
-static int dev_create(struct dm_ioctl *param, size_t param_size)
+static int dev_create(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
int r, m = DM_ANY_MINOR;
struct mapped_device *md;
@@ -823,7 +828,7 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
return md;
}
-static int dev_remove(struct dm_ioctl *param, size_t param_size)
+static int dev_remove(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
struct hash_cell *hc;
struct mapped_device *md;
@@ -888,7 +893,7 @@ static int invalid_str(char *str, void *end)
return -EINVAL;
}
-static int dev_rename(struct dm_ioctl *param, size_t param_size)
+static int dev_rename(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
int r;
char *new_data = (char *) param + param->data_start;
@@ -918,7 +923,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
return 0;
}
-static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
+static int dev_set_geometry(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
int r = -EINVAL, x;
struct mapped_device *md;
@@ -1067,7 +1072,7 @@ static int do_resume(struct dm_ioctl *param)
* Set or unset the suspension state of a device.
* If the device already is in the requested state we just return its status.
*/
-static int dev_suspend(struct dm_ioctl *param, size_t param_size)
+static int dev_suspend(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
if (param->flags & DM_SUSPEND_FLAG)
return do_suspend(param);
@@ -1079,7 +1084,7 @@ static int dev_suspend(struct dm_ioctl *param, size_t param_size)
* Copies device info back to user space, used by
* the create and info ioctls.
*/
-static int dev_status(struct dm_ioctl *param, size_t param_size)
+static int dev_status(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
struct mapped_device *md;
@@ -1170,7 +1175,7 @@ static void retrieve_status(struct dm_table *table,
/*
* Wait for a device to report an event
*/
-static int dev_wait(struct dm_ioctl *param, size_t param_size)
+static int dev_wait(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
int r = 0;
struct mapped_device *md;
@@ -1207,6 +1212,19 @@ out:
return r;
}
+/*
+ * Re-arm polling on this open file: snapshot the current global event
+ * number so that dm_poll() only reports events raised after this call.
+ * @param and @param_size are unused.  Always returns 0.
+ */
+static int dev_arm_poll(struct file *filp, struct dm_ioctl *param, size_t param_size)
+{
+	struct dm_file *file_state = filp->private_data;
+	unsigned current_nr = atomic_read(&dm_global_event_nr);
+
+	file_state->global_event_nr = current_nr;
+	return 0;
+}
+
static inline fmode_t get_mode(struct dm_ioctl *param)
{
fmode_t mode = FMODE_READ | FMODE_WRITE;
@@ -1267,7 +1285,7 @@ static int populate_table(struct dm_table *table,
return dm_table_complete(table);
}
-static bool is_valid_type(unsigned cur, unsigned new)
+static bool is_valid_type(enum dm_queue_mode cur, enum dm_queue_mode new)
{
if (cur == new ||
(cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED))
@@ -1276,7 +1294,7 @@ static bool is_valid_type(unsigned cur, unsigned new)
return false;
}
-static int table_load(struct dm_ioctl *param, size_t param_size)
+static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
int r;
struct hash_cell *hc;
@@ -1363,7 +1381,7 @@ err:
return r;
}
-static int table_clear(struct dm_ioctl *param, size_t param_size)
+static int table_clear(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
struct hash_cell *hc;
struct mapped_device *md;
@@ -1437,7 +1455,7 @@ static void retrieve_deps(struct dm_table *table,
param->data_size = param->data_start + needed;
}
-static int table_deps(struct dm_ioctl *param, size_t param_size)
+static int table_deps(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
struct mapped_device *md;
struct dm_table *table;
@@ -1463,7 +1481,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
* Return the status of a device as a text string for each
* target.
*/
-static int table_status(struct dm_ioctl *param, size_t param_size)
+static int table_status(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
struct mapped_device *md;
struct dm_table *table;
@@ -1518,7 +1536,7 @@ static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
/*
* Pass a message to the target that's at the supplied device offset.
*/
-static int target_message(struct dm_ioctl *param, size_t param_size)
+static int target_message(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
int r, argc;
char **argv;
@@ -1635,7 +1653,8 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
{DM_LIST_VERSIONS_CMD, 0, list_versions},
{DM_TARGET_MSG_CMD, 0, target_message},
- {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}
+ {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry},
+ {DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll},
};
if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
@@ -1698,6 +1717,7 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
struct dm_ioctl *dmi;
int secure_data;
const size_t minimum_data_size = offsetof(struct dm_ioctl, data);
+ unsigned noio_flag;
if (copy_from_user(param_kernel, user, minimum_data_size))
return -EFAULT;
@@ -1716,19 +1736,14 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
}
/*
- * Try to avoid low memory issues when a device is suspended.
+ * Use __GFP_HIGH to avoid low memory issues when a device is
+ * suspended and the ioctl is needed to resume it.
* Use kmalloc() rather than vmalloc() when we can.
*/
dmi = NULL;
- if (param_kernel->data_size <= KMALLOC_MAX_SIZE)
- dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
-
- if (!dmi) {
- unsigned noio_flag;
- noio_flag = memalloc_noio_save();
- dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL);
- memalloc_noio_restore(noio_flag);
- }
+ noio_flag = memalloc_noio_save();
+ dmi = kvmalloc(param_kernel->data_size, GFP_KERNEL | __GFP_HIGH);
+ memalloc_noio_restore(noio_flag);
if (!dmi) {
if (secure_data && clear_user(user, param_kernel->data_size))
@@ -1777,12 +1792,12 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
cmd == DM_LIST_VERSIONS_CMD)
return 0;
- if ((cmd == DM_DEV_CREATE_CMD)) {
+ if (cmd == DM_DEV_CREATE_CMD) {
if (!*param->name) {
DMWARN("name not supplied when creating device");
return -EINVAL;
}
- } else if ((*param->uuid && *param->name)) {
+ } else if (*param->uuid && *param->name) {
DMWARN("only supply one of name or uuid, cmd(%u)", cmd);
return -EINVAL;
}
@@ -1794,7 +1809,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
return 0;
}
-static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
+static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *user)
{
int r = 0;
int ioctl_flags;
@@ -1847,8 +1862,8 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
if (r)
goto out;
- param->data_size = sizeof(*param);
- r = fn(param, input_param_size);
+ param->data_size = offsetof(struct dm_ioctl, data);
+ r = fn(file, param, input_param_size);
if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) &&
unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS))
@@ -1867,7 +1882,7 @@ out:
static long dm_ctl_ioctl(struct file *file, uint command, ulong u)
{
- return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u);
+ return (long)ctl_ioctl(file, command, (struct dm_ioctl __user *)u);
}
#ifdef CONFIG_COMPAT
@@ -1879,8 +1894,47 @@ static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u)
#define dm_compat_ctl_ioctl NULL
#endif
+/*
+ * Open the control device: mark the file non-seekable and attach a
+ * struct dm_file whose event number starts at the current global value.
+ * Returns 0, the error from nonseekable_open(), or -ENOMEM.
+ */
+static int dm_open(struct inode *inode, struct file *filp)
+{
+	struct dm_file *file_state;
+	int err = nonseekable_open(inode, filp);
+
+	if (unlikely(err))
+		return err;
+
+	file_state = kmalloc(sizeof(struct dm_file), GFP_KERNEL);
+	filp->private_data = file_state;
+	if (!file_state)
+		return -ENOMEM;
+
+	file_state->global_event_nr = atomic_read(&dm_global_event_nr);
+	return 0;
+}
+
+/* Free the per-file state that dm_open() stored in filp->private_data. */
+static int dm_release(struct inode *inode, struct file *filp)
+{
+ kfree(filp->private_data);
+ return 0;
+}
+
+/*
+ * Poll handler for the control device.  Reports POLLIN once the global
+ * event number has moved past the value remembered for this open file.
+ */
+static unsigned dm_poll(struct file *filp, poll_table *wait)
+{
+	struct dm_file *file_state = filp->private_data;
+	int events_ahead;
+
+	poll_wait(filp, &dm_global_eventq, wait);
+
+	/* The difference is compared as a signed value, not the raw counters. */
+	events_ahead = (int)(atomic_read(&dm_global_event_nr) - file_state->global_event_nr);
+
+	return events_ahead > 0 ? POLLIN : 0;
+}
+
static const struct file_operations _ctl_fops = {
- .open = nonseekable_open,
+ .open = dm_open,
+ .release = dm_release,
+ .poll = dm_poll,
.unlocked_ioctl = dm_ctl_ioctl,
.compat_ioctl = dm_compat_ctl_ioctl,
.owner = THIS_MODULE,
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 9e9d04cb7d51..cf2c67e35eaf 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -356,6 +356,7 @@ struct kcopyd_job {
struct mutex lock;
atomic_t sub_jobs;
sector_t progress;
+ sector_t write_offset;
struct kcopyd_job *master_job;
};
@@ -386,6 +387,31 @@ void dm_kcopyd_exit(void)
* Functions to push and pop a job onto the head of a given job
* list.
*/
+/*
+ * Pick the first job on @jobs that may be issued now and unlink it.
+ *
+ * Reads, and writes without the DM_KCOPYD_WRITE_SEQ flag, are always
+ * eligible.  A sequential write is eligible only when its write_offset
+ * equals the master job's current write_offset, which is then advanced by
+ * the job's source sector count.  Returns NULL if no job is eligible.
+ * @kc is not used here.
+ */
+static struct kcopyd_job *pop_io_job(struct list_head *jobs,
+				     struct dm_kcopyd_client *kc)
+{
+	struct kcopyd_job *candidate;
+
+	list_for_each_entry(candidate, jobs, list) {
+		bool ordered_write = candidate->rw != READ &&
+				     test_bit(DM_KCOPYD_WRITE_SEQ, &candidate->flags);
+
+		if (ordered_write) {
+			struct kcopyd_job *master = candidate->master_job;
+
+			/* Not this job's turn yet; look further down the list. */
+			if (candidate->write_offset != master->write_offset)
+				continue;
+			master->write_offset += candidate->source.count;
+		}
+
+		list_del(&candidate->list);
+		return candidate;
+	}
+
+	return NULL;
+}
+
static struct kcopyd_job *pop(struct list_head *jobs,
struct dm_kcopyd_client *kc)
{
@@ -395,8 +421,12 @@ static struct kcopyd_job *pop(struct list_head *jobs,
spin_lock_irqsave(&kc->job_lock, flags);
if (!list_empty(jobs)) {
- job = list_entry(jobs->next, struct kcopyd_job, list);
- list_del(&job->list);
+ if (jobs == &kc->io_jobs)
+ job = pop_io_job(jobs, kc);
+ else {
+ job = list_entry(jobs->next, struct kcopyd_job, list);
+ list_del(&job->list);
+ }
}
spin_unlock_irqrestore(&kc->job_lock, flags);
@@ -506,6 +536,14 @@ static int run_io_job(struct kcopyd_job *job)
.client = job->kc->io_client,
};
+ /*
+ * If we need to write sequentially and some reads or writes failed,
+ * no point in continuing.
+ */
+ if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
+ job->master_job->write_err)
+ return -EIO;
+
io_job_start(job->kc->throttle);
if (job->rw == READ)
@@ -655,6 +693,7 @@ static void segment_complete(int read_err, unsigned long write_err,
int i;
*sub_job = *job;
+ sub_job->write_offset = progress;
sub_job->source.sector += progress;
sub_job->source.count = count;
@@ -723,6 +762,27 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
job->num_dests = num_dests;
memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
+ /*
+ * If one of the destinations is a host-managed zoned block device,
+ * we need to write sequentially. If one of the destinations is a
+ * host-aware device, then leave it to the caller to choose what to do.
+ */
+ if (!test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) {
+ for (i = 0; i < job->num_dests; i++) {
+ if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) {
+ set_bit(DM_KCOPYD_WRITE_SEQ, &job->flags);
+ break;
+ }
+ }
+ }
+
+ /*
+ * If we need to write sequentially, errors cannot be ignored.
+ */
+ if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
+ test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags))
+ clear_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags);
+
if (from) {
job->source = *from;
job->pages = NULL;
@@ -733,11 +793,11 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
job->pages = &zero_page_list;
/*
- * Use WRITE SAME to optimize zeroing if all dests support it.
+ * Use WRITE ZEROES to optimize zeroing if all dests support it.
*/
- job->rw = REQ_OP_WRITE_SAME;
+ job->rw = REQ_OP_WRITE_ZEROES;
for (i = 0; i < job->num_dests; i++)
- if (!bdev_write_same(job->dests[i].bdev)) {
+ if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) {
job->rw = WRITE;
break;
}
@@ -746,6 +806,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
job->fn = fn;
job->context = context;
job->master_job = job;
+ job->write_offset = 0;
if (job->source.count <= SUB_JOB_SIZE)
dispatch_job(job);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 4788b0b989a9..41971a090e34 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -9,6 +9,7 @@
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
+#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/device-mapper.h>
@@ -59,6 +60,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
+ ti->num_write_zeroes_bios = 1;
ti->private = lc;
return 0;
@@ -87,7 +89,7 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
struct linear_c *lc = ti->private;
bio->bi_bdev = lc->dev->bdev;
- if (bio_sectors(bio))
+ if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)
bio->bi_iter.bi_sector =
linear_map_sector(ti, bio->bi_iter.bi_sector);
}
@@ -99,6 +101,17 @@ static int linear_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
+static int linear_end_io(struct dm_target *ti, struct bio *bio,
+ blk_status_t *error)
+{
+ struct linear_c *lc = ti->private;
+
+ if (!*error && bio_op(bio) == REQ_OP_ZONE_REPORT)
+ dm_remap_zone_report(ti, bio, lc->start);
+
+ return DM_ENDIO_DONE;
+}
+
static void linear_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
@@ -141,35 +154,65 @@ static int linear_iterate_devices(struct dm_target *ti,
return fn(ti, lc->dev, lc->start, ti->len, data);
}
-static long linear_direct_access(struct dm_target *ti, sector_t sector,
- void **kaddr, pfn_t *pfn, long size)
+static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
+ long nr_pages, void **kaddr, pfn_t *pfn)
{
+ long ret;
struct linear_c *lc = ti->private;
struct block_device *bdev = lc->dev->bdev;
- struct blk_dax_ctl dax = {
- .sector = linear_map_sector(ti, sector),
- .size = size,
- };
- long ret;
+ struct dax_device *dax_dev = lc->dev->dax_dev;
+ sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+
+ dev_sector = linear_map_sector(ti, sector);
+ ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff);
+ if (ret)
+ return ret;
+ return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
+}
- ret = bdev_direct_access(bdev, &dax);
- *kaddr = dax.addr;
- *pfn = dax.pfn;
+static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i)
+{
+ struct linear_c *lc = ti->private;
+ struct block_device *bdev = lc->dev->bdev;
+ struct dax_device *dax_dev = lc->dev->dax_dev;
+ sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
- return ret;
+ dev_sector = linear_map_sector(ti, sector);
+ if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
+ return 0;
+ return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
+}
+
+static void linear_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr,
+ size_t size)
+{
+ struct linear_c *lc = ti->private;
+ struct block_device *bdev = lc->dev->bdev;
+ struct dax_device *dax_dev = lc->dev->dax_dev;
+ sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+
+ dev_sector = linear_map_sector(ti, sector);
+ if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff))
+ return;
+ dax_flush(dax_dev, pgoff, addr, size);
}
static struct target_type linear_target = {
.name = "linear",
- .version = {1, 3, 0},
+ .version = {1, 4, 0},
+ .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
.module = THIS_MODULE,
.ctr = linear_ctr,
.dtr = linear_dtr,
.map = linear_map,
+ .end_io = linear_end_io,
.status = linear_status,
.prepare_ioctl = linear_prepare_ioctl,
.iterate_devices = linear_iterate_devices,
- .direct_access = linear_direct_access,
+ .direct_access = linear_dax_direct_access,
+ .dax_copy_from_iter = linear_dax_copy_from_iter,
+ .dax_flush = linear_dax_flush,
};
int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 4dfe38655a49..a1da0eb58a93 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -150,10 +150,10 @@ static void log_end_io(struct bio *bio)
{
struct log_writes_c *lc = bio->bi_private;
- if (bio->bi_error) {
+ if (bio->bi_status) {
unsigned long flags;
- DMERR("Error writing log block, error=%d", bio->bi_error);
+ DMERR("Error writing log block, error=%d", bio->bi_status);
spin_lock_irqsave(&lc->blocks_lock, flags);
lc->logging_enabled = false;
spin_unlock_irqrestore(&lc->blocks_lock, flags);
@@ -586,7 +586,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio)
spin_lock_irq(&lc->blocks_lock);
lc->logging_enabled = false;
spin_unlock_irq(&lc->blocks_lock);
- return -ENOMEM;
+ return DM_MAPIO_KILL;
}
INIT_LIST_HEAD(&block->list);
pb->block = block;
@@ -639,7 +639,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio)
spin_lock_irq(&lc->blocks_lock);
lc->logging_enabled = false;
spin_unlock_irq(&lc->blocks_lock);
- return -ENOMEM;
+ return DM_MAPIO_KILL;
}
src = kmap_atomic(bv.bv_page);
@@ -664,7 +664,8 @@ map_bio:
return DM_MAPIO_REMAPPED;
}
-static int normal_end_io(struct dm_target *ti, struct bio *bio, int error)
+static int normal_end_io(struct dm_target *ti, struct bio *bio,
+ blk_status_t *error)
{
struct log_writes_c *lc = ti->private;
struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
@@ -686,7 +687,7 @@ static int normal_end_io(struct dm_target *ti, struct bio *bio, int error)
spin_unlock_irqrestore(&lc->blocks_lock, flags);
}
- return error;
+ return DM_ENDIO_DONE;
}
/*
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 3570bcb7a4a4..0e8ab5bb3575 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -90,13 +90,7 @@ struct multipath {
atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */
atomic_t pg_init_count; /* Number of times pg_init called */
- unsigned queue_mode;
-
- /*
- * We must use a mempool of dm_mpath_io structs so that we
- * can resubmit bios on error.
- */
- mempool_t *mpio_pool;
+ enum dm_queue_mode queue_mode;
struct mutex work_mutex;
struct work_struct trigger_event;
@@ -115,11 +109,10 @@ struct dm_mpath_io {
typedef int (*action_fn) (struct pgpath *pgpath);
-static struct kmem_cache *_mpio_cache;
-
static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void trigger_event(struct work_struct *work);
-static void activate_path(struct work_struct *work);
+static void activate_or_offline_path(struct pgpath *pgpath);
+static void activate_path_work(struct work_struct *work);
static void process_queued_bios(struct work_struct *work);
/*-----------------------------------------------
@@ -144,7 +137,7 @@ static struct pgpath *alloc_pgpath(void)
if (pgpath) {
pgpath->is_active = true;
- INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
+ INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work);
}
return pgpath;
@@ -209,7 +202,6 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
init_waitqueue_head(&m->pg_init_wait);
mutex_init(&m->work_mutex);
- m->mpio_pool = NULL;
m->queue_mode = DM_TYPE_NONE;
m->ti = ti;
@@ -229,16 +221,7 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
else
m->queue_mode = DM_TYPE_REQUEST_BASED;
- }
-
- if (m->queue_mode == DM_TYPE_REQUEST_BASED) {
- unsigned min_ios = dm_get_reserved_rq_based_ios();
-
- m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
- if (!m->mpio_pool)
- return -ENOMEM;
- }
- else if (m->queue_mode == DM_TYPE_BIO_BASED) {
+ } else if (m->queue_mode == DM_TYPE_BIO_BASED) {
INIT_WORK(&m->process_queued_bios, process_queued_bios);
/*
* bio-based doesn't support any direct scsi_dh management;
@@ -263,7 +246,6 @@ static void free_multipath(struct multipath *m)
kfree(m->hw_handler_name);
kfree(m->hw_handler_params);
- mempool_destroy(m->mpio_pool);
kfree(m);
}
@@ -272,38 +254,6 @@ static struct dm_mpath_io *get_mpio(union map_info *info)
return info->ptr;
}
-static struct dm_mpath_io *set_mpio(struct multipath *m, union map_info *info)
-{
- struct dm_mpath_io *mpio;
-
- if (!m->mpio_pool) {
- /* Use blk-mq pdu memory requested via per_io_data_size */
- mpio = get_mpio(info);
- memset(mpio, 0, sizeof(*mpio));
- return mpio;
- }
-
- mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
- if (!mpio)
- return NULL;
-
- memset(mpio, 0, sizeof(*mpio));
- info->ptr = mpio;
-
- return mpio;
-}
-
-static void clear_request_fn_mpio(struct multipath *m, union map_info *info)
-{
- /* Only needed for non blk-mq (.request_fn) multipath */
- if (m->mpio_pool) {
- struct dm_mpath_io *mpio = info->ptr;
-
- info->ptr = NULL;
- mempool_free(mpio, m->mpio_pool);
- }
-}
-
static size_t multipath_per_bio_data_size(void)
{
return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details);
@@ -348,6 +298,8 @@ static int __pg_init_all_paths(struct multipath *m)
struct pgpath *pgpath;
unsigned long pg_init_delay = 0;
+ lockdep_assert_held(&m->lock);
+
if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
return 0;
@@ -372,13 +324,16 @@ static int __pg_init_all_paths(struct multipath *m)
return atomic_read(&m->pg_init_in_progress);
}
-static void pg_init_all_paths(struct multipath *m)
+static int pg_init_all_paths(struct multipath *m)
{
+ int ret;
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
- __pg_init_all_paths(m);
+ ret = __pg_init_all_paths(m);
spin_unlock_irqrestore(&m->lock, flags);
+
+ return ret;
}
static void __switch_pg(struct multipath *m, struct priority_group *pg)
@@ -487,59 +442,35 @@ failed:
}
/*
- * Check whether bios must be queued in the device-mapper core rather
- * than here in the target.
- *
- * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
- * same value then we are not between multipath_presuspend()
- * and multipath_resume() calls and we have no need to check
- * for the DMF_NOFLUSH_SUSPENDING flag.
+ * dm_report_EIO() is a macro instead of a function to make pr_debug()
+ * report the function name and line number of the function from which
+ * it has been invoked.
*/
-static bool __must_push_back(struct multipath *m)
-{
- return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
- test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) &&
- dm_noflush_suspending(m->ti));
-}
-
-static bool must_push_back_rq(struct multipath *m)
-{
- bool r;
- unsigned long flags;
-
- spin_lock_irqsave(&m->lock, flags);
- r = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
- __must_push_back(m));
- spin_unlock_irqrestore(&m->lock, flags);
-
- return r;
-}
-
-static bool must_push_back_bio(struct multipath *m)
-{
- bool r;
- unsigned long flags;
-
- spin_lock_irqsave(&m->lock, flags);
- r = __must_push_back(m);
- spin_unlock_irqrestore(&m->lock, flags);
-
- return r;
-}
+#define dm_report_EIO(m) \
+do { \
+ struct mapped_device *md = dm_table_get_md((m)->ti->table); \
+ \
+ pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \
+ dm_device_name(md), \
+ test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \
+ test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \
+ dm_noflush_suspending((m)->ti)); \
+} while (0)
/*
* Map cloned requests (request-based multipath)
*/
-static int __multipath_map(struct dm_target *ti, struct request *clone,
- union map_info *map_context,
- struct request *rq, struct request **__clone)
+static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
+ union map_info *map_context,
+ struct request **__clone)
{
struct multipath *m = ti->private;
- int r = DM_MAPIO_REQUEUE;
- size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
+ size_t nr_bytes = blk_rq_bytes(rq);
struct pgpath *pgpath;
struct block_device *bdev;
- struct dm_mpath_io *mpio;
+ struct dm_mpath_io *mpio = get_mpio(map_context);
+ struct request_queue *q;
+ struct request *clone;
/* Do we need to select a new pgpath? */
pgpath = lockless_dereference(m->current_pgpath);
@@ -547,51 +478,40 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
pgpath = choose_pgpath(m, nr_bytes);
if (!pgpath) {
- if (must_push_back_rq(m))
+ if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
return DM_MAPIO_DELAY_REQUEUE;
- return -EIO; /* Failed */
+ dm_report_EIO(m); /* Failed */
+ return DM_MAPIO_KILL;
} else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
- pg_init_all_paths(m);
- return r;
+ if (pg_init_all_paths(m))
+ return DM_MAPIO_DELAY_REQUEUE;
+ return DM_MAPIO_REQUEUE;
}
- mpio = set_mpio(m, map_context);
- if (!mpio)
- /* ENOMEM, requeue */
- return r;
-
+ memset(mpio, 0, sizeof(*mpio));
mpio->pgpath = pgpath;
mpio->nr_bytes = nr_bytes;
bdev = pgpath->path.dev->bdev;
-
- if (clone) {
- /*
- * Old request-based interface: allocated clone is passed in.
- * Used by: .request_fn stacked on .request_fn path(s).
- */
- clone->q = bdev_get_queue(bdev);
- clone->rq_disk = bdev->bd_disk;
- clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
- } else {
- /*
- * blk-mq request-based interface; used by both:
- * .request_fn stacked on blk-mq path(s) and
- * blk-mq stacked on blk-mq path(s).
- */
- clone = blk_mq_alloc_request(bdev_get_queue(bdev),
- rq_data_dir(rq), BLK_MQ_REQ_NOWAIT);
- if (IS_ERR(clone)) {
- /* EBUSY, ENODEV or EWOULDBLOCK: requeue */
- clear_request_fn_mpio(m, map_context);
- return r;
+ q = bdev_get_queue(bdev);
+ clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC);
+ if (IS_ERR(clone)) {
+ /* EBUSY, ENODEV or EWOULDBLOCK: requeue */
+ bool queue_dying = blk_queue_dying(q);
+ DMERR_LIMIT("blk_get_request() returned %ld%s - requeuing",
+ PTR_ERR(clone), queue_dying ? " (path offline)" : "");
+ if (queue_dying) {
+ atomic_inc(&m->pg_init_in_progress);
+ activate_or_offline_path(pgpath);
+ return DM_MAPIO_REQUEUE;
}
- clone->bio = clone->biotail = NULL;
- clone->rq_disk = bdev->bd_disk;
- clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
- *__clone = clone;
+ return DM_MAPIO_DELAY_REQUEUE;
}
+ clone->bio = clone->biotail = NULL;
+ clone->rq_disk = bdev->bd_disk;
+ clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+ *__clone = clone;
if (pgpath->pg->ps.type->start_io)
pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
@@ -600,22 +520,9 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
return DM_MAPIO_REMAPPED;
}
-static int multipath_map(struct dm_target *ti, struct request *clone,
- union map_info *map_context)
-{
- return __multipath_map(ti, clone, map_context, NULL, NULL);
-}
-
-static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
- union map_info *map_context,
- struct request **clone)
-{
- return __multipath_map(ti, NULL, map_context, rq, clone);
-}
-
static void multipath_release_clone(struct request *clone)
{
- blk_mq_free_request(clone);
+ blk_put_request(clone);
}
/*
@@ -649,15 +556,16 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
}
if (!pgpath) {
- if (!must_push_back_bio(m))
- return -EIO;
- return DM_MAPIO_REQUEUE;
+ if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
+ return DM_MAPIO_REQUEUE;
+ dm_report_EIO(m);
+ return DM_MAPIO_KILL;
}
mpio->pgpath = pgpath;
mpio->nr_bytes = nr_bytes;
- bio->bi_error = 0;
+ bio->bi_status = 0;
bio->bi_bdev = pgpath->path.dev->bdev;
bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
@@ -713,15 +621,31 @@ static void process_queued_bios(struct work_struct *work)
blk_start_plug(&plug);
while ((bio = bio_list_pop(&bios))) {
r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio));
- if (r < 0 || r == DM_MAPIO_REQUEUE) {
- bio->bi_error = r;
+ switch (r) {
+ case DM_MAPIO_KILL:
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ break;
+ case DM_MAPIO_REQUEUE:
+ bio->bi_status = BLK_STS_DM_REQUEUE;
bio_endio(bio);
- } else if (r == DM_MAPIO_REMAPPED)
+ break;
+ case DM_MAPIO_REMAPPED:
generic_make_request(bio);
+ break;
+ }
}
blk_finish_plug(&plug);
}
+static void assign_bit(bool value, long nr, unsigned long *addr)
+{
+ if (value)
+ set_bit(nr, addr);
+ else
+ clear_bit(nr, addr);
+}
+
/*
* If we run out of usable paths, should we queue I/O or error it?
*/
@@ -731,23 +655,11 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
-
- if (save_old_value) {
- if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
- set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
- else
- clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
- } else {
- if (queue_if_no_path)
- set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
- else
- clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
- }
- if (queue_if_no_path)
- set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
- else
- clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
-
+ assign_bit((save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) ||
+ (!save_old_value && queue_if_no_path),
+ MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
+ assign_bit(queue_if_no_path || dm_noflush_suspending(m->ti),
+ MPATHF_QUEUE_IF_NO_PATH, &m->flags);
spin_unlock_irqrestore(&m->lock, flags);
if (!queue_if_no_path) {
@@ -1185,9 +1097,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
+ ti->num_write_zeroes_bios = 1;
if (m->queue_mode == DM_TYPE_BIO_BASED)
ti->per_io_data_size = multipath_per_bio_data_size();
- else if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED)
+ else
ti->per_io_data_size = sizeof(struct dm_mpath_io);
return 0;
@@ -1519,10 +1432,8 @@ out:
spin_unlock_irqrestore(&m->lock, flags);
}
-static void activate_path(struct work_struct *work)
+static void activate_or_offline_path(struct pgpath *pgpath)
{
- struct pgpath *pgpath =
- container_of(work, struct pgpath, activate_path.work);
struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
if (pgpath->is_active && !blk_queue_dying(q))
@@ -1531,22 +1442,23 @@ static void activate_path(struct work_struct *work)
pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
}
-static int noretry_error(int error)
+static void activate_path_work(struct work_struct *work)
+{
+ struct pgpath *pgpath =
+ container_of(work, struct pgpath, activate_path.work);
+
+ activate_or_offline_path(pgpath);
+}
+
+static int noretry_error(blk_status_t error)
{
switch (error) {
- case -EBADE:
- /*
- * EBADE signals an reservation conflict.
- * We shouldn't fail the path here as we can communicate with
- * the target. We should failover to the next path, but in
- * doing so we might be causing a ping-pong between paths.
- * So just return the reservation conflict error.
- */
- case -EOPNOTSUPP:
- case -EREMOTEIO:
- case -EILSEQ:
- case -ENODATA:
- case -ENOSPC:
+ case BLK_STS_NOTSUPP:
+ case BLK_STS_NOSPC:
+ case BLK_STS_TARGET:
+ case BLK_STS_NEXUS:
+ case BLK_STS_MEDIUM:
+ case BLK_STS_RESOURCE:
return 1;
}
@@ -1554,12 +1466,13 @@ static int noretry_error(int error)
return 0;
}
-/*
- * end_io handling
- */
-static int do_end_io(struct multipath *m, struct request *clone,
- int error, struct dm_mpath_io *mpio)
+static int multipath_end_io(struct dm_target *ti, struct request *clone,
+ blk_status_t error, union map_info *map_context)
{
+ struct dm_mpath_io *mpio = get_mpio(map_context);
+ struct pgpath *pgpath = mpio->pgpath;
+ int r = DM_ENDIO_DONE;
+
/*
* We don't queue any clone request inside the multipath target
* during end I/O handling, since those clone requests don't have
@@ -1571,70 +1484,53 @@ static int do_end_io(struct multipath *m, struct request *clone,
* request into dm core, which will remake a clone request and
* clone bios for it and resubmit it later.
*/
- int r = DM_ENDIO_REQUEUE;
+ if (error && !noretry_error(error)) {
+ struct multipath *m = ti->private;
- if (!error && !clone->errors)
- return 0; /* I/O complete */
+ r = DM_ENDIO_REQUEUE;
- if (noretry_error(error))
- return error;
-
- if (mpio->pgpath)
- fail_path(mpio->pgpath);
+ if (pgpath)
+ fail_path(pgpath);
- if (!atomic_read(&m->nr_valid_paths)) {
- if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
- if (!must_push_back_rq(m))
- r = -EIO;
+ if (atomic_read(&m->nr_valid_paths) == 0 &&
+ !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+ if (error == BLK_STS_IOERR)
+ dm_report_EIO(m);
+ /* complete with the original error */
+ r = DM_ENDIO_DONE;
}
}
- return r;
-}
-
-static int multipath_end_io(struct dm_target *ti, struct request *clone,
- int error, union map_info *map_context)
-{
- struct multipath *m = ti->private;
- struct dm_mpath_io *mpio = get_mpio(map_context);
- struct pgpath *pgpath;
- struct path_selector *ps;
- int r;
-
- BUG_ON(!mpio);
-
- r = do_end_io(m, clone, error, mpio);
- pgpath = mpio->pgpath;
if (pgpath) {
- ps = &pgpath->pg->ps;
+ struct path_selector *ps = &pgpath->pg->ps;
+
if (ps->type->end_io)
ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
}
- clear_request_fn_mpio(m, map_context);
return r;
}
-static int do_end_io_bio(struct multipath *m, struct bio *clone,
- int error, struct dm_mpath_io *mpio)
+static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
+ blk_status_t *error)
{
+ struct multipath *m = ti->private;
+ struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
+ struct pgpath *pgpath = mpio->pgpath;
unsigned long flags;
+ int r = DM_ENDIO_DONE;
- if (!error)
- return 0; /* I/O complete */
-
- if (noretry_error(error))
- return error;
+ if (!*error || noretry_error(*error))
+ goto done;
- if (mpio->pgpath)
- fail_path(mpio->pgpath);
+ if (pgpath)
+ fail_path(pgpath);
- if (!atomic_read(&m->nr_valid_paths)) {
- if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
- if (!must_push_back_bio(m))
- return -EIO;
- return DM_ENDIO_REQUEUE;
- }
+ if (atomic_read(&m->nr_valid_paths) == 0 &&
+ !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+ dm_report_EIO(m);
+ *error = BLK_STS_IOERR;
+ goto done;
}
/* Queue for the daemon to resubmit */
@@ -1646,23 +1542,11 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone,
if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
queue_work(kmultipathd, &m->process_queued_bios);
- return DM_ENDIO_INCOMPLETE;
-}
-
-static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int error)
-{
- struct multipath *m = ti->private;
- struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
- struct pgpath *pgpath;
- struct path_selector *ps;
- int r;
-
- BUG_ON(!mpio);
-
- r = do_end_io_bio(m, clone, error, mpio);
- pgpath = mpio->pgpath;
+ r = DM_ENDIO_INCOMPLETE;
+done:
if (pgpath) {
- ps = &pgpath->pg->ps;
+ struct path_selector *ps = &pgpath->pg->ps;
+
if (ps->type->end_io)
ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
}
@@ -1701,10 +1585,8 @@ static void multipath_resume(struct dm_target *ti)
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
- if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags))
- set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
- else
- clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
+ assign_bit(test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags),
+ MPATHF_QUEUE_IF_NO_PATH, &m->flags);
spin_unlock_irqrestore(&m->lock, flags);
}
@@ -1764,6 +1646,9 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
case DM_TYPE_MQ_REQUEST_BASED:
DMEMIT("queue_mode mq ");
break;
+ default:
+ WARN_ON_ONCE(true);
+ break;
}
}
}
@@ -2060,7 +1945,6 @@ static struct target_type multipath_target = {
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
- .map_rq = multipath_map,
.clone_and_map_rq = multipath_clone_and_map,
.release_clone_rq = multipath_release_clone,
.rq_end_io = multipath_end_io,
@@ -2080,11 +1964,6 @@ static int __init dm_multipath_init(void)
{
int r;
- /* allocate a slab for the dm_mpath_ios */
- _mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
- if (!_mpio_cache)
- return -ENOMEM;
-
r = dm_register_target(&multipath_target);
if (r < 0) {
DMERR("request-based register failed %d", r);
@@ -2120,8 +1999,6 @@ bad_alloc_kmpath_handlerd:
bad_alloc_kmultipathd:
dm_unregister_target(&multipath_target);
bad_register_target:
- kmem_cache_destroy(_mpio_cache);
-
return r;
}
@@ -2131,7 +2008,6 @@ static void __exit dm_multipath_exit(void)
destroy_workqueue(kmultipathd);
dm_unregister_target(&multipath_target);
- kmem_cache_destroy(_mpio_cache);
}
module_init(dm_multipath_init);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index b8f978e551d7..2e10c2f13a34 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2017 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
@@ -24,6 +24,11 @@
*/
#define MIN_FREE_RESHAPE_SPACE to_sector(4*4096)
+/*
+ * Minimum journal space 4 MiB in sectors.
+ */
+#define MIN_RAID456_JOURNAL_SPACE (4*2048)
+
static bool devices_handle_discard_safely = false;
/*
@@ -73,6 +78,12 @@ struct raid_dev {
#define __CTR_FLAG_DATA_OFFSET 13 /* 2 */ /* Only with reshapable raid4/5/6/10! */
#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
+/* New for v1.10.0 */
+#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6 (journal device)! */
+
+/* New for v1.11.1 */
+#define __CTR_FLAG_JOURNAL_MODE 16 /* 2 */ /* Only with raid4/5/6 (journal mode)! */
+
/*
* Flags for rs->ctr_flags field.
*/
@@ -91,6 +102,10 @@ struct raid_dev {
#define CTR_FLAG_DELTA_DISKS (1 << __CTR_FLAG_DELTA_DISKS)
#define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET)
#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
+#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV)
+#define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE)
+
+#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
/*
* Definitions of various constructor flags to
@@ -163,7 +178,9 @@ struct raid_dev {
CTR_FLAG_STRIPE_CACHE | \
CTR_FLAG_REGION_SIZE | \
CTR_FLAG_DELTA_DISKS | \
- CTR_FLAG_DATA_OFFSET)
+ CTR_FLAG_DATA_OFFSET | \
+ CTR_FLAG_JOURNAL_DEV | \
+ CTR_FLAG_JOURNAL_MODE)
#define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \
CTR_FLAG_REBUILD | \
@@ -173,7 +190,9 @@ struct raid_dev {
CTR_FLAG_STRIPE_CACHE | \
CTR_FLAG_REGION_SIZE | \
CTR_FLAG_DELTA_DISKS | \
- CTR_FLAG_DATA_OFFSET)
+ CTR_FLAG_DATA_OFFSET | \
+ CTR_FLAG_JOURNAL_DEV | \
+ CTR_FLAG_JOURNAL_MODE)
/* ...valid options definitions per raid level */
/*
@@ -222,6 +241,13 @@ struct raid_set {
struct raid_type *raid_type;
struct dm_target_callbacks callbacks;
+ /* Optional raid4/5/6 journal device */
+ struct journal_dev {
+ struct dm_dev *dev;
+ struct md_rdev rdev;
+ int mode;
+ } journal_dev;
+
struct raid_dev dev[0];
};
@@ -306,6 +332,8 @@ static struct arg_name_flag {
{ CTR_FLAG_DATA_OFFSET, "data_offset"},
{ CTR_FLAG_DELTA_DISKS, "delta_disks"},
{ CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
+ { CTR_FLAG_JOURNAL_DEV, "journal_dev" },
+ { CTR_FLAG_JOURNAL_MODE, "journal_mode" },
};
/* Return argument name string for given @flag */
@@ -324,6 +352,39 @@ static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
return NULL;
}
+/* Define correlation of raid456 journal cache modes and dm-raid target line parameters */
+static struct {
+ const int mode;
+ const char *param;
+} _raid456_journal_mode[] = {
+ { R5C_JOURNAL_MODE_WRITE_THROUGH , "writethrough" },
+ { R5C_JOURNAL_MODE_WRITE_BACK , "writeback" }
+};
+
+/* Return MD raid4/5/6 journal mode for dm @journal_mode one */
+static int dm_raid_journal_mode_to_md(const char *mode)
+{
+ int m = ARRAY_SIZE(_raid456_journal_mode);
+
+ while (m--)
+ if (!strcasecmp(mode, _raid456_journal_mode[m].param))
+ return _raid456_journal_mode[m].mode;
+
+ return -EINVAL;
+}
+
+/* Return dm-raid raid4/5/6 journal mode string for @mode */
+static const char *md_journal_mode_to_dm_raid(const int mode)
+{
+ int m = ARRAY_SIZE(_raid456_journal_mode);
+
+ while (m--)
+ if (mode == _raid456_journal_mode[m].mode)
+ return _raid456_journal_mode[m].param;
+
+ return "unknown";
+}
+
/*
* Bool helpers to test for various raid levels of a raid set.
* It's level as reported by the superblock rather than
@@ -370,7 +431,7 @@ static bool rs_is_reshapable(struct raid_set *rs)
/* Return true, if raid set in @rs is recovering */
static bool rs_is_recovering(struct raid_set *rs)
{
- return rs->md.recovery_cp < rs->dev[0].rdev.sectors;
+ return rs->md.recovery_cp < rs->md.dev_sectors;
}
/* Return true, if raid set in @rs is reshaping */
@@ -627,7 +688,8 @@ static void rs_set_capacity(struct raid_set *rs)
* is unintended in case of out-of-place reshaping
*/
rdev_for_each(rdev, mddev)
- rdev->sectors = mddev->dev_sectors;
+ if (!test_bit(Journal, &rdev->flags))
+ rdev->sectors = mddev->dev_sectors;
set_capacity(gendisk, mddev->array_sectors);
revalidate_disk(gendisk);
@@ -713,6 +775,11 @@ static void raid_set_free(struct raid_set *rs)
{
int i;
+ if (rs->journal_dev.dev) {
+ md_rdev_clear(&rs->journal_dev.rdev);
+ dm_put_device(rs->ti, rs->journal_dev.dev);
+ }
+
for (i = 0; i < rs->raid_disks; i++) {
if (rs->dev[i].meta_dev)
dm_put_device(rs->ti, rs->dev[i].meta_dev);
@@ -760,10 +827,11 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
rs->dev[i].data_dev = NULL;
/*
- * There are no offsets, since there is a separate device
- * for data and metadata.
+ * There are no offsets initially.
+ * Out of place reshape will set them accordingly.
*/
rs->dev[i].rdev.data_offset = 0;
+ rs->dev[i].rdev.new_data_offset = 0;
rs->dev[i].rdev.mddev = &rs->md;
arg = dm_shift_arg(as);
@@ -821,6 +889,9 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
rebuild++;
}
+ if (rs->journal_dev.dev)
+ list_add_tail(&rs->journal_dev.rdev.same_set, &rs->md.disks);
+
if (metadata_available) {
rs->md.external = 0;
rs->md.persistent = 1;
@@ -1026,6 +1097,8 @@ too_many:
* [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
* [stripe_cache <sectors>] Stripe cache size for higher RAIDs
* [region_size <sectors>] Defines granularity of bitmap
+ * [journal_dev <dev>] raid4/5/6 journaling device
+ * (i.e. write hole closing log)
*
* RAID10-only options:
* [raid10_copies <# copies>] Number of copies. (Default: 2)
@@ -1133,7 +1206,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
/*
* Parameters that take a string value are checked here.
*/
-
+ /* "raid10_format {near|offset|far}" */
if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
rs->ti->error = "Only one 'raid10_format' argument pair allowed";
@@ -1151,6 +1224,63 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
continue;
}
+ /* "journal_dev <dev>" */
+ if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) {
+ int r;
+ struct md_rdev *jdev;
+
+ if (test_and_set_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+ rs->ti->error = "Only one raid4/5/6 set journaling device allowed";
+ return -EINVAL;
+ }
+ if (!rt_is_raid456(rt)) {
+ rs->ti->error = "'journal_dev' is an invalid parameter for this RAID type";
+ return -EINVAL;
+ }
+ r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
+ &rs->journal_dev.dev);
+ if (r) {
+ rs->ti->error = "raid4/5/6 journal device lookup failure";
+ return r;
+ }
+ jdev = &rs->journal_dev.rdev;
+ md_rdev_init(jdev);
+ jdev->mddev = &rs->md;
+ jdev->bdev = rs->journal_dev.dev->bdev;
+ jdev->sectors = to_sector(i_size_read(jdev->bdev->bd_inode));
+ if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) {
+ rs->ti->error = "No space for raid4/5/6 journal";
+ return -ENOSPC;
+ }
+ rs->journal_dev.mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+ set_bit(Journal, &jdev->flags);
+ continue;
+ }
+
+ /* "journal_mode <mode>" ("journal_dev" mandatory!) */
+ if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE))) {
+ int r;
+
+ if (!test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+ rs->ti->error = "raid4/5/6 'journal_mode' is invalid without 'journal_dev'";
+ return -EINVAL;
+ }
+ if (test_and_set_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
+ rs->ti->error = "Only one raid4/5/6 'journal_mode' argument allowed";
+ return -EINVAL;
+ }
+ r = dm_raid_journal_mode_to_md(arg);
+ if (r < 0) {
+ rs->ti->error = "Invalid 'journal_mode' argument";
+ return r;
+ }
+ rs->journal_dev.mode = r;
+ continue;
+ }
+
+ /*
+ * Parameters with number values from here on.
+ */
if (kstrtoint(arg, 10, &value) < 0) {
rs->ti->error = "Bad numerical argument given in raid params";
return -EINVAL;
@@ -1425,6 +1555,25 @@ static unsigned int rs_data_stripes(struct raid_set *rs)
return rs->raid_disks - rs->raid_type->parity_devs;
}
+/*
+ * Retrieve rdev->sectors from any valid raid device of @rs (0 if none)
+ * to allow userspace to pass in arbitrary "- -" device tuples.
+ */
+static sector_t __rdev_sectors(struct raid_set *rs)
+{
+ int i;
+
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ struct md_rdev *rdev = &rs->dev[i].rdev;
+
+ if (!test_bit(Journal, &rdev->flags) &&
+ rdev->bdev && rdev->sectors)
+ return rdev->sectors;
+ }
+
+ return 0;
+}
+
/* Calculate the sectors per device and per array used for @rs */
static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
{
@@ -1468,7 +1617,8 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
array_sectors = (data_stripes + delta_disks) * dev_sectors;
rdev_for_each(rdev, mddev)
- rdev->sectors = dev_sectors;
+ if (!test_bit(Journal, &rdev->flags))
+ rdev->sectors = dev_sectors;
mddev->array_sectors = array_sectors;
mddev->dev_sectors = dev_sectors;
@@ -1510,9 +1660,9 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
else if (dev_sectors == MaxSector)
/* Prevent recovery */
__rs_setup_recovery(rs, MaxSector);
- else if (rs->dev[0].rdev.sectors < dev_sectors)
+ else if (__rdev_sectors(rs) < dev_sectors)
/* Grown raid set */
- __rs_setup_recovery(rs, rs->dev[0].rdev.sectors);
+ __rs_setup_recovery(rs, __rdev_sectors(rs));
else
__rs_setup_recovery(rs, MaxSector);
}
@@ -1777,7 +1927,7 @@ struct dm_raid_superblock {
/********************************************************************
* BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
*
- * FEATURE_FLAG_SUPPORTS_V190 in the features member indicates that those exist
+ * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist
*/
__le32 flags; /* Flags defining array states for reshaping */
@@ -1851,18 +2001,21 @@ static int rs_check_reshape(struct raid_set *rs)
return -EPERM;
}
-static int read_disk_sb(struct md_rdev *rdev, int size)
+static int read_disk_sb(struct md_rdev *rdev, int size, bool force_reload)
{
BUG_ON(!rdev->sb_page);
- if (rdev->sb_loaded)
+ if (rdev->sb_loaded && !force_reload)
return 0;
+ rdev->sb_loaded = 0;
+
if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) {
DMERR("Failed to read superblock of device at position %d",
rdev->raid_disk);
md_error(rdev->mddev, rdev);
- return -EINVAL;
+ set_bit(Faulty, &rdev->flags);
+ return -EIO;
}
rdev->sb_loaded = 1;
@@ -1939,6 +2092,11 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->layout = cpu_to_le32(mddev->layout);
sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
+ /********************************************************************
+ * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
+ *
+ * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist
+ */
sb->new_level = cpu_to_le32(mddev->new_level);
sb->new_layout = cpu_to_le32(mddev->new_layout);
sb->new_stripe_sectors = cpu_to_le32(mddev->new_chunk_sectors);
@@ -1990,7 +2148,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
return -EINVAL;
}
- r = read_disk_sb(rdev, rdev->sb_size);
+ r = read_disk_sb(rdev, rdev->sb_size, false);
if (r)
return r;
@@ -2146,6 +2304,9 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
*/
d = 0;
rdev_for_each(r, mddev) {
+ if (test_bit(Journal, &rdev->flags))
+ continue;
+
if (test_bit(FirstUse, &r->flags))
new_devs++;
@@ -2201,7 +2362,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
*/
sb_retrieve_failed_devices(sb, failed_devices);
rdev_for_each(r, mddev) {
- if (!r->sb_page)
+ if (test_bit(Journal, &rdev->flags) ||
+ !r->sb_page)
continue;
sb2 = page_address(r->sb_page);
sb2->failed_devices = 0;
@@ -2253,7 +2415,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
struct mddev *mddev = &rs->md;
struct dm_raid_superblock *sb;
- if (rs_is_raid0(rs) || !rdev->sb_page)
+ if (rs_is_raid0(rs) || !rdev->sb_page || rdev->raid_disk < 0)
return 0;
sb = page_address(rdev->sb_page);
@@ -2278,11 +2440,17 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
/* Enable bitmap creation for RAID levels != 0 */
mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
- rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
+ mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
- /* Retrieve device size stored in superblock to be prepared for shrink */
- rdev->sectors = le64_to_cpu(sb->sectors);
+ /*
+ * Retrieve rdev size stored in superblock to be prepared for shrink.
+ * Check extended superblock members are present otherwise the size
+ * will not be set!
+ */
+ if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190)
+ rdev->sectors = le64_to_cpu(sb->sectors);
+
rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
if (rdev->recovery_offset == MaxSector)
set_bit(In_sync, &rdev->flags);
@@ -2316,21 +2484,22 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{
int r;
- struct raid_dev *dev;
- struct md_rdev *rdev, *tmp, *freshest;
+ struct md_rdev *rdev, *freshest;
struct mddev *mddev = &rs->md;
freshest = NULL;
- rdev_for_each_safe(rdev, tmp, mddev) {
+ rdev_for_each(rdev, mddev) {
+ if (test_bit(Journal, &rdev->flags))
+ continue;
+
/*
* Skipping super_load due to CTR_FLAG_SYNC will cause
* the array to undergo initialization again as
* though it were new. This is the intended effect
* of the "sync" directive.
*
- * When reshaping capability is added, we must ensure
- * that the "sync" directive is disallowed during the
- * reshape.
+ * With reshaping capability added, we must ensure that
+ * the "sync" directive is disallowed during the reshape.
*/
if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
continue;
@@ -2347,6 +2516,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
case 0:
break;
default:
+ /* This is a failure to read the superblock from the metadata device. */
/*
* We have to keep any raid0 data/metadata device pairs or
* the MD raid0 personality will fail to start the array.
@@ -2354,33 +2524,16 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
if (rs_is_raid0(rs))
continue;
- dev = container_of(rdev, struct raid_dev, rdev);
- if (dev->meta_dev)
- dm_put_device(ti, dev->meta_dev);
-
- dev->meta_dev = NULL;
- rdev->meta_bdev = NULL;
-
- if (rdev->sb_page)
- put_page(rdev->sb_page);
-
- rdev->sb_page = NULL;
-
- rdev->sb_loaded = 0;
-
/*
- * We might be able to salvage the data device
- * even though the meta device has failed. For
- * now, we behave as though '- -' had been
- * set for this device in the table.
+ * We keep the dm_devs to be able to emit the device tuple
+ * properly on the table line in raid_status() (rather than
+ * mistakenly acting as if '- -' got passed into the constructor).
+ *
+ * The rdev has to stay on the same_set list to allow for
+ * the attempt to restore faulty devices on second resume.
*/
- if (dev->data_dev)
- dm_put_device(ti, dev->data_dev);
-
- dev->data_dev = NULL;
- rdev->bdev = NULL;
-
- list_del(&rdev->same_set);
+ rdev->raid_disk = rdev->saved_raid_disk = -1;
+ break;
}
}
@@ -2401,7 +2554,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
return -EINVAL;
rdev_for_each(rdev, mddev)
- if ((rdev != freshest) && super_validate(rs, rdev))
+ if (!test_bit(Journal, &rdev->flags) &&
+ rdev != freshest &&
+ super_validate(rs, rdev))
return -EINVAL;
return 0;
}
@@ -2488,10 +2643,12 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
return -ENOSPC;
}
out:
- /* Adjust data offsets on all rdevs */
+ /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */
rdev_for_each(rdev, &rs->md) {
- rdev->data_offset = data_offset;
- rdev->new_data_offset = new_data_offset;
+ if (!test_bit(Journal, &rdev->flags)) {
+ rdev->data_offset = data_offset;
+ rdev->new_data_offset = new_data_offset;
+ }
}
return 0;
@@ -2504,8 +2661,10 @@ static void __reorder_raid_disk_indexes(struct raid_set *rs)
struct md_rdev *rdev;
rdev_for_each(rdev, &rs->md) {
- rdev->raid_disk = i++;
- rdev->saved_raid_disk = rdev->new_raid_disk = -1;
+ if (!test_bit(Journal, &rdev->flags)) {
+ rdev->raid_disk = i++;
+ rdev->saved_raid_disk = rdev->new_raid_disk = -1;
+ }
}
}
@@ -2728,7 +2887,9 @@ static void configure_discard_support(struct raid_set *rs)
/* Assume discards not supported until after checks below. */
ti->discards_supported = false;
- /* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
+ /*
+ * XXX: RAID level 4,5,6 require zeroing for safety.
+ */
raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
for (i = 0; i < rs->raid_disks; i++) {
@@ -2742,8 +2903,6 @@ static void configure_discard_support(struct raid_set *rs)
return;
if (raid456) {
- if (!q->limits.discard_zeroes_data)
- return;
if (!devices_handle_discard_safely) {
DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty.");
DMERR("Set dm-raid.devices_handle_discard_safely=Y to override.");
@@ -2782,7 +2941,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
bool resize;
struct raid_type *rt;
unsigned int num_raid_params, num_raid_devs;
- sector_t calculated_dev_sectors;
+ sector_t calculated_dev_sectors, rdev_sectors;
struct raid_set *rs = NULL;
const char *arg;
struct rs_layout rs_layout;
@@ -2845,7 +3004,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto bad;
- calculated_dev_sectors = rs->dev[0].rdev.sectors;
+ calculated_dev_sectors = rs->md.dev_sectors;
/*
* Backup any new raid set level, layout, ...
@@ -2858,7 +3017,14 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto bad;
- resize = calculated_dev_sectors != rs->dev[0].rdev.sectors;
+ rdev_sectors = __rdev_sectors(rs);
+ if (!rdev_sectors) {
+ ti->error = "Invalid rdev size";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ resize = calculated_dev_sectors != rdev_sectors;
INIT_WORK(&rs->md.event_work, do_table_event);
ti->private = rs;
@@ -2902,6 +3068,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
+ /* We can't takeover a journaled raid4/5/6 */
+ if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+ ti->error = "Can't takeover a journaled raid4/5/6 set";
+ r = -EPERM;
+ goto bad;
+ }
+
/*
* If a takeover is needed, userspace sets any additional
* devices to rebuild and we can check for a valid request here.
@@ -2924,6 +3097,18 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
rs_set_new(rs);
} else if (rs_reshape_requested(rs)) {
/*
+ * No need to check for 'ongoing' takeover here, because takeover
+ * is an instant operation as opposed to an ongoing reshape.
+ */
+
+ /* We can't reshape a journaled raid4/5/6 */
+ if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+ ti->error = "Can't reshape a journaled raid4/5/6 set";
+ r = -EPERM;
+ goto bad;
+ }
+
+ /*
* We can only prepare for a reshape here, because the
* raid set needs to run to provide the repective reshape
* check functions via its MD personality instance.
@@ -2972,6 +3157,16 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
rs->callbacks.congested_fn = raid_is_congested;
dm_table_add_target_callbacks(ti->table, &rs->callbacks);
+ /* If raid4/5/6 journal mode explicitly requested (only possible with journal dev) -> set it */
+ if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
+ r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode);
+ if (r) {
+ ti->error = "Failed to set raid4/5/6 journal mode";
+ mddev_unlock(&rs->md);
+ goto bad_journal_mode_set;
+ }
+ }
+
mddev_suspend(&rs->md);
/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
@@ -3005,6 +3200,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
mddev_unlock(&rs->md);
return 0;
+bad_journal_mode_set:
bad_stripe_cache:
bad_check_reshape:
md_stop(&rs->md);
@@ -3071,18 +3267,23 @@ static const char *decipher_sync_action(struct mddev *mddev)
}
/*
- * Return status string @rdev
+ * Return status string for @rdev
*
* Status characters:
*
- * 'D' = Dead/Failed device
- * 'a' = Alive but not in-sync
- * 'A' = Alive and in-sync
+ * 'D' = Dead/Failed raid set component or raid4/5/6 journal device
+ * 'a' = Alive but not in-sync raid set component _or_ alive raid4/5/6 'write_back' journal device
+ * 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device
+ * '-' = Non-existing device (i.e. userspace passed '- -' into the ctr)
*/
-static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
+static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync)
{
- if (test_bit(Faulty, &rdev->flags))
+ if (!rdev->bdev)
+ return "-";
+ else if (test_bit(Faulty, &rdev->flags))
return "D";
+ else if (test_bit(Journal, &rdev->flags))
+ return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a";
else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
return "a";
else
@@ -3151,7 +3352,8 @@ static sector_t rs_get_progress(struct raid_set *rs,
* being initialized.
*/
rdev_for_each(rdev, mddev)
- if (!test_bit(In_sync, &rdev->flags))
+ if (!test_bit(Journal, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags))
*array_in_sync = true;
#if 0
r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */
@@ -3183,7 +3385,6 @@ static void raid_status(struct dm_target *ti, status_type_t type,
sector_t progress, resync_max_sectors, resync_mismatches;
const char *sync_action;
struct raid_type *rt;
- struct md_rdev *rdev;
switch (type) {
case STATUSTYPE_INFO:
@@ -3204,9 +3405,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
atomic64_read(&mddev->resync_mismatches) : 0;
sync_action = decipher_sync_action(&rs->md);
- /* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
- rdev_for_each(rdev, mddev)
- DMEMIT(__raid_dev_status(rdev, array_in_sync));
+ /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
+ for (i = 0; i < rs->raid_disks; i++)
+ DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync));
/*
* In-sync/Reshape ratio:
@@ -3252,6 +3453,12 @@ static void raid_status(struct dm_target *ti, status_type_t type,
* so retrieving it from the first raid disk is sufficient.
*/
DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset);
+
+ /*
+ * v1.10.0+:
+ */
+ DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
+ __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-");
break;
case STATUSTYPE_TABLE:
@@ -3265,39 +3472,31 @@ static void raid_status(struct dm_target *ti, status_type_t type,
raid_param_cnt += rebuild_disks * 2 +
write_mostly_params +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
- hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
+ hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
+ (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) +
+ (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0);
+
/* Emit table line */
+ /* This has to be in the documented order for userspace! */
DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
- if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
- DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
- raid10_md_layout_to_format(mddev->layout));
- if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
- raid10_md_layout_to_copies(mddev->layout));
- if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
- DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
- if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
- DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
- (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
- if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
- DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
- (unsigned long long) rs->data_offset);
- if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
- DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
- mddev->bitmap_info.daemon_sleep);
- if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
- max(rs->delta_disks, mddev->delta_disks));
- if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
- max_nr_stripes);
+ if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
+ DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
if (rebuild_disks)
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks))
DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
rs->dev[i].rdev.raid_disk);
+ if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
+ DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
+ mddev->bitmap_info.daemon_sleep);
+ if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
+ mddev->sync_speed_min);
+ if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
+ mddev->sync_speed_max);
if (write_mostly_params)
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
@@ -3306,12 +3505,30 @@ static void raid_status(struct dm_target *ti, status_type_t type,
if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags))
DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND),
mddev->bitmap_info.max_write_behind);
- if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
- mddev->sync_speed_max);
- if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
- mddev->sync_speed_min);
+ if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
+ max_nr_stripes);
+ if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
+ DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
+ (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
+ if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
+ raid10_md_layout_to_copies(mddev->layout));
+ if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
+ DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
+ raid10_md_layout_to_format(mddev->layout));
+ if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
+ max(rs->delta_disks, mddev->delta_disks));
+ if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
+ DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
+ (unsigned long long) rs->data_offset);
+ if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags))
+ DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV),
+ __get_dev_name(rs->journal_dev.dev));
+ if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags))
+ DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE),
+ md_journal_mode_to_dm_raid(rs->journal_dev.mode));
DMEMIT(" %d", rs->raid_disks);
for (i = 0; i < rs->raid_disks; i++)
DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
@@ -3345,12 +3562,15 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv)
else if (!strcasecmp(argv[0], "recover"))
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
else {
- if (!strcasecmp(argv[0], "check"))
+ if (!strcasecmp(argv[0], "check")) {
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
- else if (!!strcasecmp(argv[0], "repair"))
+ set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ } else if (!strcasecmp(argv[0], "repair")) {
+ set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ } else
return -EINVAL;
- set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
}
if (mddev->ro == 2) {
/* A write to sync_action is enough to justify
@@ -3427,11 +3647,14 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
- for (i = 0; i < rs->md.raid_disks; i++) {
+ for (i = 0; i < mddev->raid_disks; i++) {
r = &rs->dev[i].rdev;
- if (test_bit(Faulty, &r->flags) && r->sb_page &&
- sync_page_io(r, 0, r->sb_size, r->sb_page,
- REQ_OP_READ, 0, true)) {
+ /* HM FIXME: enhance journal device recovery processing */
+ if (test_bit(Journal, &r->flags))
+ continue;
+
+ if (test_bit(Faulty, &r->flags) &&
+ r->meta_bdev && !read_disk_sb(r, r->sb_size, true)) {
DMINFO("Faulty %s device #%d has readable super block."
" Attempting to revive it.",
rs->raid_type->name, i);
@@ -3445,22 +3668,26 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
* '>= 0' - meaning we must call this function
* ourselves.
*/
- if ((r->raid_disk >= 0) &&
- (mddev->pers->hot_remove_disk(mddev, r) != 0))
- /* Failed to revive this device, try next */
- continue;
-
- r->raid_disk = i;
- r->saved_raid_disk = i;
flags = r->flags;
+ clear_bit(In_sync, &r->flags); /* Mandatory for hot remove. */
+ if (r->raid_disk >= 0) {
+ if (mddev->pers->hot_remove_disk(mddev, r)) {
+ /* Failed to revive this device, try next */
+ r->flags = flags;
+ continue;
+ }
+ } else
+ r->raid_disk = r->saved_raid_disk = i;
+
clear_bit(Faulty, &r->flags);
clear_bit(WriteErrorSeen, &r->flags);
- clear_bit(In_sync, &r->flags);
+
if (mddev->pers->hot_add_disk(mddev, r)) {
- r->raid_disk = -1;
- r->saved_raid_disk = -1;
+ /* Failed to revive this device, try next */
+ r->raid_disk = r->saved_raid_disk = -1;
r->flags = flags;
} else {
+ clear_bit(In_sync, &r->flags);
r->recovery_offset = 0;
set_bit(i, (void *) cleared_failed_devices);
cleared = true;
@@ -3473,6 +3700,9 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
uint64_t failed_devices[DISKS_ARRAY_ELEMS];
rdev_for_each(r, &rs->md) {
+ if (test_bit(Journal, &r->flags))
+ continue;
+
sb = page_address(r->sb_page);
sb_retrieve_failed_devices(sb, failed_devices);
@@ -3594,7 +3824,7 @@ static int raid_preresume(struct dm_target *ti)
return r;
/* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */
- if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) &&
+ if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap &&
mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) {
r = bitmap_resize(mddev->bitmap, mddev->dev_sectors,
to_bytes(rs->requested_bitmap_chunk_sectors), 0);
@@ -3643,7 +3873,15 @@ static void raid_resume(struct dm_target *ti)
mddev->ro = 0;
mddev->in_sync = 0;
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ /*
+ * Keep the RAID set frozen if reshape/rebuild flags are set.
+ * The RAID set is unfrozen once the next table load/resume,
+ * which clears the reshape/rebuild flags, occurs.
+ * This ensures that the constructor for the inactive table
+ * retrieves an up-to-date reshape_position.
+ */
+ if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS))
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (mddev->suspended)
mddev_resume(mddev);
@@ -3651,7 +3889,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 9, 1},
+ .version = {1, 11, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 2ddc2d20e62d..a4fbd911d566 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -145,6 +145,7 @@ static void dispatch_bios(void *context, struct bio_list *bio_list)
struct dm_raid1_bio_record {
struct mirror *m;
+ /* if details->bi_bdev == NULL, details were not saved */
struct dm_bio_details details;
region_t write_region;
};
@@ -260,7 +261,7 @@ static int mirror_flush(struct dm_target *ti)
struct mirror *m;
struct dm_io_request io_req = {
.bi_op = REQ_OP_WRITE,
- .bi_op_flags = REQ_PREFLUSH,
+ .bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
.mem.type = DM_IO_KMEM,
.mem.ptr.addr = NULL,
.client = ms->io_client,
@@ -490,9 +491,9 @@ static void hold_bio(struct mirror_set *ms, struct bio *bio)
* If device is suspended, complete the bio.
*/
if (dm_noflush_suspending(ms->ti))
- bio->bi_error = DM_ENDIO_REQUEUE;
+ bio->bi_status = BLK_STS_DM_REQUEUE;
else
- bio->bi_error = -EIO;
+ bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
return;
@@ -626,7 +627,7 @@ static void write_callback(unsigned long error, void *context)
* degrade the array.
*/
if (bio_op(bio) == REQ_OP_DISCARD) {
- bio->bi_error = -EOPNOTSUPP;
+ bio->bi_status = BLK_STS_NOTSUPP;
bio_endio(bio);
return;
}
@@ -1124,7 +1125,6 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->per_io_data_size = sizeof(struct dm_raid1_bio_record);
- ti->discard_zeroes_data_unsupported = true;
ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0);
if (!ms->kmirrord_wq) {
@@ -1199,6 +1199,8 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
struct dm_raid1_bio_record *bio_record =
dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
+ bio_record->details.bi_bdev = NULL;
+
if (rw == WRITE) {
/* Save region for mirror_end_io() handler */
bio_record->write_region = dm_rh_bio_to_region(ms->rh, bio);
@@ -1208,14 +1210,14 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0);
if (r < 0 && r != -EWOULDBLOCK)
- return r;
+ return DM_MAPIO_KILL;
/*
* If region is not in-sync queue the bio.
*/
if (!r || (r == -EWOULDBLOCK)) {
if (bio->bi_opf & REQ_RAHEAD)
- return -EWOULDBLOCK;
+ return DM_MAPIO_KILL;
queue_bio(ms, bio, rw);
return DM_MAPIO_SUBMITTED;
@@ -1227,7 +1229,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
*/
m = choose_mirror(ms, bio->bi_iter.bi_sector);
if (unlikely(!m))
- return -EIO;
+ return DM_MAPIO_KILL;
dm_bio_record(&bio_record->details, bio);
bio_record->m = m;
@@ -1237,7 +1239,8 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
-static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
+static int mirror_end_io(struct dm_target *ti, struct bio *bio,
+ blk_status_t *error)
{
int rw = bio_data_dir(bio);
struct mirror_set *ms = (struct mirror_set *) ti->private;
@@ -1253,16 +1256,26 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
if (!(bio->bi_opf & REQ_PREFLUSH) &&
bio_op(bio) != REQ_OP_DISCARD)
dm_rh_dec(ms->rh, bio_record->write_region);
- return error;
+ return DM_ENDIO_DONE;
}
- if (error == -EOPNOTSUPP)
- return error;
+ if (*error == BLK_STS_NOTSUPP)
+ goto out;
+
+ if (bio->bi_opf & REQ_RAHEAD)
+ goto out;
- if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD))
- return error;
+ if (unlikely(*error)) {
+ if (!bio_record->details.bi_bdev) {
+ /*
+ * There wasn't enough memory to record necessary
+ * information for a retry or there was no other
+ * mirror in-sync.
+ */
+ DMERR_LIMIT("Mirror read failed.");
+ return DM_ENDIO_DONE;
+ }
- if (unlikely(error)) {
m = bio_record->m;
DMERR("Mirror read failed from %s. Trying alternative device.",
@@ -1278,7 +1291,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
bd = &bio_record->details;
dm_bio_restore(bd, bio);
- bio->bi_error = 0;
+ bio_record->details.bi_bdev = NULL;
+ bio->bi_status = 0;
queue_bio(ms, bio, rw);
return DM_ENDIO_INCOMPLETE;
@@ -1286,7 +1300,10 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
DMERR("All replicated volumes dead, failing I/O");
}
- return error;
+out:
+ bio_record->details.bi_bdev = NULL;
+
+ return DM_ENDIO_DONE;
}
static void mirror_presuspend(struct dm_target *ti)
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 6c25213ab38c..bdbb7e6e8212 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -17,8 +17,8 @@
#include <linux/module.h>
#define DM_MSG_PREFIX "multipath round-robin"
-#define RR_MIN_IO 1000
-#define RR_VERSION "1.1.0"
+#define RR_MIN_IO 1
+#define RR_VERSION "1.2.0"
/*-----------------------------------------------------------------
* Path-handling code, paths are held in lists
@@ -47,44 +47,19 @@ struct selector {
struct list_head valid_paths;
struct list_head invalid_paths;
spinlock_t lock;
- struct dm_path * __percpu *current_path;
- struct percpu_counter repeat_count;
};
-static void set_percpu_current_path(struct selector *s, struct dm_path *path)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- *per_cpu_ptr(s->current_path, cpu) = path;
-}
-
static struct selector *alloc_selector(void)
{
struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return NULL;
-
- INIT_LIST_HEAD(&s->valid_paths);
- INIT_LIST_HEAD(&s->invalid_paths);
- spin_lock_init(&s->lock);
-
- s->current_path = alloc_percpu(struct dm_path *);
- if (!s->current_path)
- goto out_current_path;
- set_percpu_current_path(s, NULL);
-
- if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL))
- goto out_repeat_count;
+ if (s) {
+ INIT_LIST_HEAD(&s->valid_paths);
+ INIT_LIST_HEAD(&s->invalid_paths);
+ spin_lock_init(&s->lock);
+ }
return s;
-
-out_repeat_count:
- free_percpu(s->current_path);
-out_current_path:
- kfree(s);
- return NULL;;
}
static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
@@ -105,8 +80,6 @@ static void rr_destroy(struct path_selector *ps)
free_paths(&s->valid_paths);
free_paths(&s->invalid_paths);
- free_percpu(s->current_path);
- percpu_counter_destroy(&s->repeat_count);
kfree(s);
ps->context = NULL;
}
@@ -157,6 +130,11 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
return -EINVAL;
}
+ if (repeat_count > 1) {
+ DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
+ repeat_count = 1;
+ }
+
/* allocate the path */
pi = kmalloc(sizeof(*pi), GFP_KERNEL);
if (!pi) {
@@ -183,9 +161,6 @@ static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
struct path_info *pi = p->pscontext;
spin_lock_irqsave(&s->lock, flags);
- if (p == *this_cpu_ptr(s->current_path))
- set_percpu_current_path(s, NULL);
-
list_move(&pi->list, &s->invalid_paths);
spin_unlock_irqrestore(&s->lock, flags);
}
@@ -208,29 +183,15 @@ static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes)
unsigned long flags;
struct selector *s = ps->context;
struct path_info *pi = NULL;
- struct dm_path *current_path = NULL;
-
- local_irq_save(flags);
- current_path = *this_cpu_ptr(s->current_path);
- if (current_path) {
- percpu_counter_dec(&s->repeat_count);
- if (percpu_counter_read_positive(&s->repeat_count) > 0) {
- local_irq_restore(flags);
- return current_path;
- }
- }
- spin_lock(&s->lock);
+ spin_lock_irqsave(&s->lock, flags);
if (!list_empty(&s->valid_paths)) {
pi = list_entry(s->valid_paths.next, struct path_info, list);
list_move_tail(&pi->list, &s->valid_paths);
- percpu_counter_set(&s->repeat_count, pi->repeat_count);
- set_percpu_current_path(s, pi->path);
- current_path = pi->path;
}
spin_unlock_irqrestore(&s->lock, flags);
- return current_path;
+ return pi ? pi->path : NULL;
}
static struct path_selector_type rr_ps = {
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 6e702fc69a83..c6ebc5b1e00e 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -71,7 +71,7 @@ static void dm_old_start_queue(struct request_queue *q)
static void dm_mq_start_queue(struct request_queue *q)
{
- blk_mq_start_stopped_hw_queues(q, true);
+ blk_mq_unquiesce_queue(q);
blk_mq_kick_requeue_list(q);
}
@@ -109,28 +109,6 @@ void dm_stop_queue(struct request_queue *q)
dm_mq_stop_queue(q);
}
-static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md,
- gfp_t gfp_mask)
-{
- return mempool_alloc(md->io_pool, gfp_mask);
-}
-
-static void free_old_rq_tio(struct dm_rq_target_io *tio)
-{
- mempool_free(tio, tio->md->io_pool);
-}
-
-static struct request *alloc_old_clone_request(struct mapped_device *md,
- gfp_t gfp_mask)
-{
- return mempool_alloc(md->rq_pool, gfp_mask);
-}
-
-static void free_old_clone_request(struct mapped_device *md, struct request *rq)
-{
- mempool_free(rq, md->rq_pool);
-}
-
/*
* Partial completion handling for request-based dm
*/
@@ -141,7 +119,7 @@ static void end_clone_bio(struct bio *clone)
struct dm_rq_target_io *tio = info->tio;
struct bio *bio = info->orig;
unsigned int nr_bytes = info->orig->bi_iter.bi_size;
- int error = clone->bi_error;
+ blk_status_t error = clone->bi_status;
bio_put(clone);
@@ -180,12 +158,12 @@ static void end_clone_bio(struct bio *clone)
* Do not use blk_end_request() here, because it may complete
* the original request before the clone, and break the ordering.
*/
- blk_update_request(tio->orig, 0, nr_bytes);
+ blk_update_request(tio->orig, BLK_STS_OK, nr_bytes);
}
static struct dm_rq_target_io *tio_from_request(struct request *rq)
{
- return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
+ return blk_mq_rq_to_pdu(rq);
}
static void rq_end_stats(struct mapped_device *md, struct request *orig)
@@ -233,57 +211,21 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
dm_put(md);
}
-static void free_rq_clone(struct request *clone)
-{
- struct dm_rq_target_io *tio = clone->end_io_data;
- struct mapped_device *md = tio->md;
-
- blk_rq_unprep_clone(clone);
-
- /*
- * It is possible for a clone_old_rq() allocated clone to
- * get passed in -- it may not yet have a request_queue.
- * This is known to occur if the error target replaces
- * a multipath target that has a request_fn queue stacked
- * on blk-mq queue(s).
- */
- if (clone->q && clone->q->mq_ops)
- /* stacked on blk-mq queue(s) */
- tio->ti->type->release_clone_rq(clone);
- else if (!md->queue->mq_ops)
- /* request_fn queue stacked on request_fn queue(s) */
- free_old_clone_request(md, clone);
-
- if (!md->queue->mq_ops)
- free_old_rq_tio(tio);
-}
-
/*
* Complete the clone and the original request.
* Must be called without clone's queue lock held,
* see end_clone_request() for more details.
*/
-static void dm_end_request(struct request *clone, int error)
+static void dm_end_request(struct request *clone, blk_status_t error)
{
int rw = rq_data_dir(clone);
struct dm_rq_target_io *tio = clone->end_io_data;
struct mapped_device *md = tio->md;
struct request *rq = tio->orig;
- if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
- rq->errors = clone->errors;
- rq->resid_len = clone->resid_len;
-
- if (rq->sense)
- /*
- * We are using the sense buffer of the original
- * request.
- * So setting the length of the sense data is enough.
- */
- rq->sense_len = clone->sense_len;
- }
+ blk_rq_unprep_clone(clone);
+ tio->ti->type->release_clone_rq(clone);
- free_rq_clone(clone);
rq_end_stats(md, rq);
if (!rq->q->mq_ops)
blk_end_request_all(rq, error);
@@ -292,22 +234,6 @@ static void dm_end_request(struct request *clone, int error)
rq_completed(md, rw, true);
}
-static void dm_unprep_request(struct request *rq)
-{
- struct dm_rq_target_io *tio = tio_from_request(rq);
- struct request *clone = tio->clone;
-
- if (!rq->q->mq_ops) {
- rq->special = NULL;
- rq->rq_flags &= ~RQF_DONTPREP;
- }
-
- if (clone)
- free_rq_clone(clone);
- else if (!tio->md->queue->mq_ops)
- free_old_rq_tio(tio);
-}
-
/*
* Requeue the original request of a clone.
*/
@@ -346,19 +272,22 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_
int rw = rq_data_dir(rq);
rq_end_stats(md, rq);
- dm_unprep_request(rq);
+ if (tio->clone) {
+ blk_rq_unprep_clone(tio->clone);
+ tio->ti->type->release_clone_rq(tio->clone);
+ }
if (!rq->q->mq_ops)
dm_old_requeue_request(rq);
else
- dm_mq_delay_requeue_request(rq, delay_requeue ? 5000 : 0);
+ dm_mq_delay_requeue_request(rq, delay_requeue ? 100/*ms*/ : 0);
rq_completed(md, rw, false);
}
-static void dm_done(struct request *clone, int error, bool mapped)
+static void dm_done(struct request *clone, blk_status_t error, bool mapped)
{
- int r = error;
+ int r = DM_ENDIO_DONE;
struct dm_rq_target_io *tio = clone->end_io_data;
dm_request_endio_fn rq_end_io = NULL;
@@ -369,20 +298,28 @@ static void dm_done(struct request *clone, int error, bool mapped)
r = rq_end_io(tio->ti, clone, error, &tio->info);
}
- if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) &&
- !clone->q->limits.max_write_same_sectors))
- disable_write_same(tio->md);
+ if (unlikely(error == BLK_STS_TARGET)) {
+ if (req_op(clone) == REQ_OP_WRITE_SAME &&
+ !clone->q->limits.max_write_same_sectors)
+ disable_write_same(tio->md);
+ if (req_op(clone) == REQ_OP_WRITE_ZEROES &&
+ !clone->q->limits.max_write_zeroes_sectors)
+ disable_write_zeroes(tio->md);
+ }
- if (r <= 0)
+ switch (r) {
+ case DM_ENDIO_DONE:
/* The target wants to complete the I/O */
- dm_end_request(clone, r);
- else if (r == DM_ENDIO_INCOMPLETE)
+ dm_end_request(clone, error);
+ break;
+ case DM_ENDIO_INCOMPLETE:
/* The target will handle the I/O */
return;
- else if (r == DM_ENDIO_REQUEUE)
+ case DM_ENDIO_REQUEUE:
/* The target wants to requeue the I/O */
dm_requeue_original_request(tio, false);
- else {
+ break;
+ default:
DMWARN("unimplemented target endio return value: %d", r);
BUG();
}
@@ -399,16 +336,15 @@ static void dm_softirq_done(struct request *rq)
int rw;
if (!clone) {
- rq_end_stats(tio->md, rq);
+ struct mapped_device *md = tio->md;
+
+ rq_end_stats(md, rq);
rw = rq_data_dir(rq);
- if (!rq->q->mq_ops) {
+ if (!rq->q->mq_ops)
blk_end_request_all(rq, tio->error);
- rq_completed(tio->md, rw, false);
- free_old_rq_tio(tio);
- } else {
+ else
blk_mq_end_request(rq, tio->error);
- rq_completed(tio->md, rw, false);
- }
+ rq_completed(md, rw, false);
return;
}
@@ -422,7 +358,7 @@ static void dm_softirq_done(struct request *rq)
* Complete the clone and the original request with the error status
* through softirq context.
*/
-static void dm_complete_request(struct request *rq, int error)
+static void dm_complete_request(struct request *rq, blk_status_t error)
{
struct dm_rq_target_io *tio = tio_from_request(rq);
@@ -430,7 +366,7 @@ static void dm_complete_request(struct request *rq, int error)
if (!rq->q->mq_ops)
blk_complete_request(rq);
else
- blk_mq_complete_request(rq, error);
+ blk_mq_complete_request(rq);
}
/*
@@ -439,7 +375,7 @@ static void dm_complete_request(struct request *rq, int error)
* Target's rq_end_io() function isn't called.
* This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
*/
-static void dm_kill_unmapped_request(struct request *rq, int error)
+static void dm_kill_unmapped_request(struct request *rq, blk_status_t error)
{
rq->rq_flags |= RQF_FAILED;
dm_complete_request(rq, error);
@@ -448,20 +384,10 @@ static void dm_kill_unmapped_request(struct request *rq, int error)
/*
* Called with the clone's queue lock held (in the case of .request_fn)
*/
-static void end_clone_request(struct request *clone, int error)
+static void end_clone_request(struct request *clone, blk_status_t error)
{
struct dm_rq_target_io *tio = clone->end_io_data;
- if (!clone->q->mq_ops) {
- /*
- * For just cleaning up the information of the queue in which
- * the clone was dispatched.
- * The clone is *NOT* freed actually here because it is alloced
- * from dm own mempool (RQF_ALLOCED isn't set).
- */
- __blk_put_request(clone->q, clone);
- }
-
/*
* Actual request completion is done in a softirq context which doesn't
* hold the clone's queue lock. Otherwise, deadlock could occur because:
@@ -475,7 +401,7 @@ static void end_clone_request(struct request *clone, int error)
static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
{
- int r;
+ blk_status_t r;
if (blk_queue_io_stat(clone->q))
clone->rq_flags |= RQF_IO_STAT;
@@ -511,9 +437,6 @@ static int setup_clone(struct request *clone, struct request *rq,
if (r)
return r;
- clone->cmd = rq->cmd;
- clone->cmd_len = rq->cmd_len;
- clone->sense = rq->sense;
clone->end_io = end_clone_request;
clone->end_io_data = tio;
@@ -522,28 +445,6 @@ static int setup_clone(struct request *clone, struct request *rq,
return 0;
}
-static struct request *clone_old_rq(struct request *rq, struct mapped_device *md,
- struct dm_rq_target_io *tio, gfp_t gfp_mask)
-{
- /*
- * Create clone for use with .request_fn request_queue
- */
- struct request *clone;
-
- clone = alloc_old_clone_request(md, gfp_mask);
- if (!clone)
- return NULL;
-
- blk_rq_init(NULL, clone);
- if (setup_clone(clone, rq, tio, gfp_mask)) {
- /* -ENOMEM */
- free_old_clone_request(md, clone);
- return NULL;
- }
-
- return clone;
-}
-
static void map_tio_request(struct kthread_work *work);
static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
@@ -565,60 +466,6 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
kthread_init_work(&tio->work, map_tio_request);
}
-static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq,
- struct mapped_device *md,
- gfp_t gfp_mask)
-{
- struct dm_rq_target_io *tio;
- int srcu_idx;
- struct dm_table *table;
-
- tio = alloc_old_rq_tio(md, gfp_mask);
- if (!tio)
- return NULL;
-
- init_tio(tio, rq, md);
-
- table = dm_get_live_table(md, &srcu_idx);
- /*
- * Must clone a request if this .request_fn DM device
- * is stacked on .request_fn device(s).
- */
- if (!dm_table_all_blk_mq_devices(table)) {
- if (!clone_old_rq(rq, md, tio, gfp_mask)) {
- dm_put_live_table(md, srcu_idx);
- free_old_rq_tio(tio);
- return NULL;
- }
- }
- dm_put_live_table(md, srcu_idx);
-
- return tio;
-}
-
-/*
- * Called with the queue lock held.
- */
-static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
-{
- struct mapped_device *md = q->queuedata;
- struct dm_rq_target_io *tio;
-
- if (unlikely(rq->special)) {
- DMWARN("Already has something in rq->special.");
- return BLKPREP_KILL;
- }
-
- tio = dm_old_prep_tio(rq, md, GFP_ATOMIC);
- if (!tio)
- return BLKPREP_DEFER;
-
- rq->special = tio;
- rq->rq_flags |= RQF_DONTPREP;
-
- return BLKPREP_OK;
-}
-
/*
* Returns:
* DM_MAPIO_* : the request has been processed as indicated
@@ -633,31 +480,18 @@ static int map_request(struct dm_rq_target_io *tio)
struct request *rq = tio->orig;
struct request *clone = NULL;
- if (tio->clone) {
- clone = tio->clone;
- r = ti->type->map_rq(ti, clone, &tio->info);
- if (r == DM_MAPIO_DELAY_REQUEUE)
- return DM_MAPIO_REQUEUE; /* .request_fn requeue is always immediate */
- } else {
- r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
- if (r < 0) {
- /* The target wants to complete the I/O */
- dm_kill_unmapped_request(rq, r);
- return r;
- }
- if (r == DM_MAPIO_REMAPPED &&
- setup_clone(clone, rq, tio, GFP_ATOMIC)) {
- /* -ENOMEM */
- ti->type->release_clone_rq(clone);
- return DM_MAPIO_REQUEUE;
- }
- }
-
+ r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
switch (r) {
case DM_MAPIO_SUBMITTED:
/* The target has taken the I/O to submit by itself later */
break;
case DM_MAPIO_REMAPPED:
+ if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
+ /* -ENOMEM */
+ ti->type->release_clone_rq(clone);
+ return DM_MAPIO_REQUEUE;
+ }
+
/* The target has remapped the I/O so dispatch it */
trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
blk_rq_pos(rq));
@@ -670,14 +504,13 @@ static int map_request(struct dm_rq_target_io *tio)
/* The target wants to requeue the I/O after a delay */
dm_requeue_original_request(tio, true);
break;
- default:
- if (r > 0) {
- DMWARN("unimplemented target map return value: %d", r);
- BUG();
- }
-
+ case DM_MAPIO_KILL:
/* The target wants to complete the I/O */
- dm_kill_unmapped_request(rq, r);
+ dm_kill_unmapped_request(rq, BLK_STS_IOERR);
+ break;
+ default:
+ DMWARN("unimplemented target map return value: %d", r);
+ BUG();
}
return r;
@@ -716,6 +549,29 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
dm_get(md);
}
+static int __dm_rq_init_rq(struct mapped_device *md, struct request *rq)
+{
+ struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+
+ /*
+ * Must initialize md member of tio, otherwise it won't
+ * be available in dm_mq_queue_rq.
+ */
+ tio->md = md;
+
+ if (md->init_tio_pdu) {
+ /* target-specific per-io data is immediately after the tio */
+ tio->info.ptr = tio + 1;
+ }
+
+ return 0;
+}
+
+static int dm_rq_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp)
+{
+ return __dm_rq_init_rq(q->rq_alloc_data, rq);
+}
+
static void map_tio_request(struct kthread_work *work)
{
struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
@@ -814,6 +670,7 @@ static void dm_old_request_fn(struct request_queue *q)
dm_start_request(md, rq);
tio = tio_from_request(rq);
+ init_tio(tio, rq, md);
/* Establish tio->ti before queuing work (map_tio_request) */
tio->ti = ti;
kthread_queue_work(&md->kworker, &tio->work);
@@ -824,10 +681,23 @@ static void dm_old_request_fn(struct request_queue *q)
/*
* Fully initialize a .request_fn request-based queue.
*/
-int dm_old_init_request_queue(struct mapped_device *md)
+int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t)
{
+ struct dm_target *immutable_tgt;
+
/* Fully initialize the queue */
- if (!blk_init_allocated_queue(md->queue, dm_old_request_fn, NULL))
+ md->queue->cmd_size = sizeof(struct dm_rq_target_io);
+ md->queue->rq_alloc_data = md;
+ md->queue->request_fn = dm_old_request_fn;
+ md->queue->init_rq_fn = dm_rq_init_rq;
+
+ immutable_tgt = dm_table_get_immutable_target(t);
+ if (immutable_tgt && immutable_tgt->per_io_data_size) {
+ /* any target-specific per-io data is immediately after the tio */
+ md->queue->cmd_size += immutable_tgt->per_io_data_size;
+ md->init_tio_pdu = true;
+ }
+ if (blk_init_allocated_queue(md->queue) < 0)
return -EINVAL;
/* disable dm_old_request_fn's merge heuristic by default */
@@ -835,7 +705,6 @@ int dm_old_init_request_queue(struct mapped_device *md)
dm_init_normal_md_queue(md);
blk_queue_softirq_done(md->queue, dm_softirq_done);
- blk_queue_prep_rq(md->queue, dm_old_prep_fn);
/* Initialize the request-based DM worker thread */
kthread_init_worker(&md->kworker);
@@ -852,28 +721,13 @@ int dm_old_init_request_queue(struct mapped_device *md)
return 0;
}
-static int dm_mq_init_request(void *data, struct request *rq,
- unsigned int hctx_idx, unsigned int request_idx,
- unsigned int numa_node)
+static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+ unsigned int hctx_idx, unsigned int numa_node)
{
- struct mapped_device *md = data;
- struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
-
- /*
- * Must initialize md member of tio, otherwise it won't
- * be available in dm_mq_queue_rq.
- */
- tio->md = md;
-
- if (md->init_tio_pdu) {
- /* target-specific per-io data is immediately after the tio */
- tio->info.ptr = tio + 1;
- }
-
- return 0;
+ return __dm_rq_init_rq(set->driver_data, rq);
}
-static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct request *rq = bd->rq;
@@ -890,7 +744,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
}
if (ti->type->busy && ti->type->busy(ti))
- return BLK_MQ_RQ_QUEUE_BUSY;
+ return BLK_STS_RESOURCE;
dm_start_request(md, rq);
@@ -907,13 +761,14 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
/* Undo dm_start_request() before requeuing */
rq_end_stats(md, rq);
rq_completed(md, rq_data_dir(rq), false);
- return BLK_MQ_RQ_QUEUE_BUSY;
+ blk_mq_delay_run_hw_queue(hctx, 100/*ms*/);
+ return BLK_STS_RESOURCE;
}
- return BLK_MQ_RQ_QUEUE_OK;
+ return BLK_STS_OK;
}
-static struct blk_mq_ops dm_mq_ops = {
+static const struct blk_mq_ops dm_mq_ops = {
.queue_rq = dm_mq_queue_rq,
.complete = dm_softirq_done,
.init_request = dm_mq_init_request,
@@ -961,10 +816,14 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
dm_init_md_queue(md);
/* backfill 'mq' sysfs registration normally done in blk_register_queue */
- blk_mq_register_dev(disk_to_dev(md->disk), q);
+ err = blk_mq_register_dev(disk_to_dev(md->disk), q);
+ if (err)
+ goto out_cleanup_queue;
return 0;
+out_cleanup_queue:
+ blk_cleanup_queue(q);
out_tag_set:
blk_mq_free_tag_set(md->tag_set);
out_kfree_tag_set:
diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h
index 4da06cae7bad..9813922e4fe5 100644
--- a/drivers/md/dm-rq.h
+++ b/drivers/md/dm-rq.h
@@ -24,7 +24,7 @@ struct dm_rq_target_io {
struct dm_target *ti;
struct request *orig, *clone;
struct kthread_work work;
- int error;
+ blk_status_t error;
union map_info info;
struct dm_stats_aux stats_aux;
unsigned long duration_jiffies;
@@ -48,7 +48,7 @@ struct dm_rq_clone_bio_info {
bool dm_use_blk_mq_default(void);
bool dm_use_blk_mq(struct mapped_device *md);
-int dm_old_init_request_queue(struct mapped_device *md);
+int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t);
int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t);
void dm_mq_cleanup_mapped_device(struct mapped_device *md);
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index b93476c3ba3f..c5534d294773 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -741,7 +741,8 @@ static void persistent_commit_exception(struct dm_exception_store *store,
/*
* Commit exceptions to disk.
*/
- if (ps->valid && area_io(ps, REQ_OP_WRITE, REQ_PREFLUSH | REQ_FUA))
+ if (ps->valid && area_io(ps, REQ_OP_WRITE,
+ REQ_PREFLUSH | REQ_FUA | REQ_SYNC))
ps->valid = 0;
/*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index c65feeada864..1ba41048b438 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1590,7 +1590,7 @@ static void full_bio_end_io(struct bio *bio)
{
void *callback_data = bio->bi_private;
- dm_kcopyd_do_callback(callback_data, 0, bio->bi_error ? 1 : 0);
+ dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0);
}
static void start_full_bio(struct dm_snap_pending_exception *pe,
@@ -1690,7 +1690,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
/* Full snapshots are not usable */
/* To get here the table must be live so s->active is always set. */
if (!s->valid)
- return -EIO;
+ return DM_MAPIO_KILL;
/* FIXME: should only take write lock if we need
* to copy an exception */
@@ -1698,7 +1698,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
if (!s->valid || (unlikely(s->snapshot_overflowed) &&
bio_data_dir(bio) == WRITE)) {
- r = -EIO;
+ r = DM_MAPIO_KILL;
goto out_unlock;
}
@@ -1723,7 +1723,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
if (!s->valid || s->snapshot_overflowed) {
free_pending_exception(pe);
- r = -EIO;
+ r = DM_MAPIO_KILL;
goto out_unlock;
}
@@ -1741,7 +1741,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
DMERR("Snapshot overflowed: Unable to allocate exception.");
} else
__invalidate_snapshot(s, -ENOMEM);
- r = -EIO;
+ r = DM_MAPIO_KILL;
goto out_unlock;
}
}
@@ -1851,14 +1851,15 @@ out_unlock:
return r;
}
-static int snapshot_end_io(struct dm_target *ti, struct bio *bio, int error)
+static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
+ blk_status_t *error)
{
struct dm_snapshot *s = ti->private;
if (is_bio_tracked(bio))
stop_tracking_chunk(s, bio);
- return 0;
+ return DM_ENDIO_DONE;
}
static void snapshot_merge_presuspend(struct dm_target *ti)
@@ -2302,8 +2303,8 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
return do_origin(o->dev, bio);
}
-static long origin_direct_access(struct dm_target *ti, sector_t sector,
- void **kaddr, pfn_t *pfn, long size)
+static long origin_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
+ long nr_pages, void **kaddr, pfn_t *pfn)
{
DMWARN("device does not support dax.");
return -EIO;
@@ -2368,7 +2369,7 @@ static struct target_type origin_target = {
.postsuspend = origin_postsuspend,
.status = origin_status,
.iterate_devices = origin_iterate_devices,
- .direct_access = origin_direct_access,
+ .direct_access = origin_dax_direct_access,
};
static struct target_type snapshot_target = {
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 38b05f23b96c..6028d8247f58 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -146,12 +146,7 @@ static void *dm_kvzalloc(size_t alloc_size, int node)
if (!claim_shared_memory(alloc_size))
return NULL;
- if (alloc_size <= KMALLOC_MAX_SIZE) {
- p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
- if (p)
- return p;
- }
- p = vzalloc_node(alloc_size, node);
+ p = kvzalloc_node(alloc_size, GFP_KERNEL | __GFP_NOMEMALLOC, node);
if (p)
return p;
@@ -175,6 +170,7 @@ static void dm_stat_free(struct rcu_head *head)
int cpu;
struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
+ kfree(s->histogram_boundaries);
kfree(s->program_id);
kfree(s->aux_data);
for_each_possible_cpu(cpu) {
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 28193a57bf47..a0375530b07f 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
+#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/log2.h>
@@ -169,6 +170,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_bios = stripes;
ti->num_discard_bios = stripes;
ti->num_write_same_bios = stripes;
+ ti->num_write_zeroes_bios = stripes;
sc->chunk_size = chunk_size;
if (chunk_size & (chunk_size - 1))
@@ -293,6 +295,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
if (unlikely(bio_op(bio) == REQ_OP_DISCARD) ||
+ unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES) ||
unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) {
target_bio_nr = dm_bio_get_target_bio_nr(bio);
BUG_ON(target_bio_nr >= sc->stripes);
@@ -308,27 +311,63 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
-static long stripe_direct_access(struct dm_target *ti, sector_t sector,
- void **kaddr, pfn_t *pfn, long size)
+static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
+ long nr_pages, void **kaddr, pfn_t *pfn)
{
+ sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
struct stripe_c *sc = ti->private;
- uint32_t stripe;
+ struct dax_device *dax_dev;
struct block_device *bdev;
- struct blk_dax_ctl dax = {
- .size = size,
- };
+ uint32_t stripe;
long ret;
- stripe_map_sector(sc, sector, &stripe, &dax.sector);
+ stripe_map_sector(sc, sector, &stripe, &dev_sector);
+ dev_sector += sc->stripe[stripe].physical_start;
+ dax_dev = sc->stripe[stripe].dev->dax_dev;
+ bdev = sc->stripe[stripe].dev->bdev;
+
+ ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff);
+ if (ret)
+ return ret;
+ return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
+}
+
+static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i)
+{
+ sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+ struct stripe_c *sc = ti->private;
+ struct dax_device *dax_dev;
+ struct block_device *bdev;
+ uint32_t stripe;
- dax.sector += sc->stripe[stripe].physical_start;
+ stripe_map_sector(sc, sector, &stripe, &dev_sector);
+ dev_sector += sc->stripe[stripe].physical_start;
+ dax_dev = sc->stripe[stripe].dev->dax_dev;
bdev = sc->stripe[stripe].dev->bdev;
- ret = bdev_direct_access(bdev, &dax);
- *kaddr = dax.addr;
- *pfn = dax.pfn;
+ if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
+ return 0;
+ return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
+}
- return ret;
+static void stripe_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr,
+ size_t size)
+{
+ sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+ struct stripe_c *sc = ti->private;
+ struct dax_device *dax_dev;
+ struct block_device *bdev;
+ uint32_t stripe;
+
+ stripe_map_sector(sc, sector, &stripe, &dev_sector);
+ dev_sector += sc->stripe[stripe].physical_start;
+ dax_dev = sc->stripe[stripe].dev->dax_dev;
+ bdev = sc->stripe[stripe].dev->bdev;
+
+ if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff))
+ return;
+ dax_flush(dax_dev, pgoff, addr, size);
}
/*
@@ -374,20 +413,21 @@ static void stripe_status(struct dm_target *ti, status_type_t type,
}
}
-static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error)
+static int stripe_end_io(struct dm_target *ti, struct bio *bio,
+ blk_status_t *error)
{
unsigned i;
char major_minor[16];
struct stripe_c *sc = ti->private;
- if (!error)
- return 0; /* I/O complete */
+ if (!*error)
+ return DM_ENDIO_DONE; /* I/O complete */
- if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD))
- return error;
+ if (bio->bi_opf & REQ_RAHEAD)
+ return DM_ENDIO_DONE;
- if (error == -EOPNOTSUPP)
- return error;
+ if (*error == BLK_STS_NOTSUPP)
+ return DM_ENDIO_DONE;
memset(major_minor, 0, sizeof(major_minor));
sprintf(major_minor, "%d:%d",
@@ -408,7 +448,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error)
schedule_work(&sc->trigger_event);
}
- return error;
+ return DM_ENDIO_DONE;
}
static int stripe_iterate_devices(struct dm_target *ti,
@@ -440,6 +480,7 @@ static void stripe_io_hints(struct dm_target *ti,
static struct target_type stripe_target = {
.name = "striped",
.version = {1, 6, 0},
+ .features = DM_TARGET_PASSES_INTEGRITY,
.module = THIS_MODULE,
.ctr = stripe_ctr,
.dtr = stripe_dtr,
@@ -448,7 +489,9 @@ static struct target_type stripe_target = {
.status = stripe_status,
.iterate_devices = stripe_iterate_devices,
.io_hints = stripe_io_hints,
- .direct_access = stripe_direct_access,
+ .direct_access = stripe_dax_direct_access,
+ .dax_copy_from_iter = stripe_dax_copy_from_iter,
+ .dax_flush = stripe_dax_flush,
};
int __init dm_stripe_init(void)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 0a427de23ed2..a39bcd9b982a 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -30,7 +30,7 @@
struct dm_table {
struct mapped_device *md;
- unsigned type;
+ enum dm_queue_mode type;
/* btree table */
unsigned int depth;
@@ -47,6 +47,7 @@ struct dm_table {
bool integrity_supported:1;
bool singleton:1;
bool all_blk_mq:1;
+ unsigned integrity_added:1;
/*
* Indicates the rw permissions for the new logical
@@ -318,6 +319,39 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
return 1;
}
+ /*
+ * If the target is mapped to zoned block device(s), check
+ * that the zones are not partially mapped.
+ */
+ if (bdev_zoned_model(bdev) != BLK_ZONED_NONE) {
+ unsigned int zone_sectors = bdev_zone_sectors(bdev);
+
+ if (start & (zone_sectors - 1)) {
+ DMWARN("%s: start=%llu not aligned to h/w zone size %u of %s",
+ dm_device_name(ti->table->md),
+ (unsigned long long)start,
+ zone_sectors, bdevname(bdev, b));
+ return 1;
+ }
+
+ /*
+ * Note: The last zone of a zoned block device may be smaller
+ * than other zones. So for a target mapping the end of a
+ * zoned block device with such a zone, len would not be zone
+ * aligned. We do not allow such last smaller zone to be part
+ * of the mapping here to ensure that mappings with multiple
+ * devices do not end up with a smaller zone in the middle of
+ * the sector range.
+ */
+ if (len & (zone_sectors - 1)) {
+ DMWARN("%s: len=%llu not aligned to h/w zone size %u of %s",
+ dm_device_name(ti->table->md),
+ (unsigned long long)len,
+ zone_sectors, bdevname(bdev, b));
+ return 1;
+ }
+ }
+
if (logical_block_size_sectors <= 1)
return 0;
@@ -372,7 +406,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
*/
dev_t dm_get_dev_t(const char *path)
{
- dev_t uninitialized_var(dev);
+ dev_t dev;
struct block_device *bdev;
bdev = lookup_bdev(path);
@@ -455,6 +489,8 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
q->limits.alignment_offset,
(unsigned long long) start << SECTOR_SHIFT);
+ limits->zoned = blk_queue_zoned_model(q);
+
return 0;
}
@@ -626,13 +662,13 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
struct dm_target *uninitialized_var(ti);
struct queue_limits ti_limits;
- unsigned i = 0;
+ unsigned i;
/*
* Check each entry in the table in turn.
*/
- while (i < dm_table_get_num_targets(table)) {
- ti = dm_table_get_target(table, i++);
+ for (i = 0; i < dm_table_get_num_targets(table); i++) {
+ ti = dm_table_get_target(table, i);
blk_set_stacking_limits(&ti_limits);
@@ -725,6 +761,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
t->immutable_target_type = tgt->type;
}
+ if (dm_target_has_integrity(tgt->type))
+ t->integrity_added = 1;
+
tgt->table = t;
tgt->begin = start;
tgt->len = len;
@@ -821,19 +860,19 @@ void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
}
EXPORT_SYMBOL(dm_consume_args);
-static bool __table_type_bio_based(unsigned table_type)
+static bool __table_type_bio_based(enum dm_queue_mode table_type)
{
return (table_type == DM_TYPE_BIO_BASED ||
table_type == DM_TYPE_DAX_BIO_BASED);
}
-static bool __table_type_request_based(unsigned table_type)
+static bool __table_type_request_based(enum dm_queue_mode table_type)
{
return (table_type == DM_TYPE_REQUEST_BASED ||
table_type == DM_TYPE_MQ_REQUEST_BASED);
}
-void dm_table_set_type(struct dm_table *t, unsigned type)
+void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
{
t->type = type;
}
@@ -850,11 +889,11 @@ static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
static bool dm_table_supports_dax(struct dm_table *t)
{
struct dm_target *ti;
- unsigned i = 0;
+ unsigned i;
/* Ensure that all targets support DAX. */
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
if (!ti->type->direct_access)
return false;
@@ -875,7 +914,7 @@ static int dm_table_determine_type(struct dm_table *t)
struct dm_target *tgt;
struct dm_dev_internal *dd;
struct list_head *devices = dm_table_get_devices(t);
- unsigned live_md_type = dm_get_md_type(t->md);
+ enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
if (t->type != DM_TYPE_NONE) {
/* target already set the table's type */
@@ -984,7 +1023,7 @@ verify_rq_based:
return 0;
}
-unsigned dm_table_get_type(struct dm_table *t)
+enum dm_queue_mode dm_table_get_type(struct dm_table *t)
{
return t->type;
}
@@ -1006,11 +1045,11 @@ struct dm_target *dm_table_get_immutable_target(struct dm_table *t)
struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
{
- struct dm_target *uninitialized_var(ti);
- unsigned i = 0;
+ struct dm_target *ti;
+ unsigned i;
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
if (dm_target_is_wildcard(ti->type))
return ti;
}
@@ -1035,7 +1074,7 @@ bool dm_table_all_blk_mq_devices(struct dm_table *t)
static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
{
- unsigned type = dm_table_get_type(t);
+ enum dm_queue_mode type = dm_table_get_type(t);
unsigned per_io_data_size = 0;
struct dm_target *tgt;
unsigned i;
@@ -1131,6 +1170,13 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t)
struct list_head *devices = dm_table_get_devices(t);
struct dm_dev_internal *dd = NULL;
struct gendisk *prev_disk = NULL, *template_disk = NULL;
+ unsigned i;
+
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ struct dm_target *ti = dm_table_get_target(t, i);
+ if (!dm_target_passes_integrity(ti->type))
+ goto no_integrity;
+ }
list_for_each_entry(dd, devices, list) {
template_disk = dd->dm_dev->bdev->bd_disk;
@@ -1168,6 +1214,10 @@ static int dm_table_register_integrity(struct dm_table *t)
struct mapped_device *md = t->md;
struct gendisk *template_disk = NULL;
+ /* If target handles integrity itself do not register it here. */
+ if (t->integrity_added)
+ return 0;
+
template_disk = dm_table_get_integrity_disk(t);
if (!template_disk)
return 0;
@@ -1313,15 +1363,16 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
*/
bool dm_table_has_no_data_devices(struct dm_table *table)
{
- struct dm_target *uninitialized_var(ti);
- unsigned i = 0, num_devices = 0;
+ struct dm_target *ti;
+ unsigned i, num_devices;
- while (i < dm_table_get_num_targets(table)) {
- ti = dm_table_get_target(table, i++);
+ for (i = 0; i < dm_table_get_num_targets(table); i++) {
+ ti = dm_table_get_target(table, i);
if (!ti->type->iterate_devices)
return false;
+ num_devices = 0;
ti->type->iterate_devices(ti, count_device, &num_devices);
if (num_devices)
return false;
@@ -1330,22 +1381,106 @@ bool dm_table_has_no_data_devices(struct dm_table *table)
return true;
}
+static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+ enum blk_zoned_model *zoned_model = data;
+
+ return q && blk_queue_zoned_model(q) == *zoned_model;
+}
+
+static bool dm_table_supports_zoned_model(struct dm_table *t,
+ enum blk_zoned_model zoned_model)
+{
+ struct dm_target *ti;
+ unsigned i;
+
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
+
+ if (zoned_model == BLK_ZONED_HM &&
+ !dm_target_supports_zoned_hm(ti->type))
+ return false;
+
+ if (!ti->type->iterate_devices ||
+ !ti->type->iterate_devices(ti, device_is_zoned_model, &zoned_model))
+ return false;
+ }
+
+ return true;
+}
+
+static int device_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+ unsigned int *zone_sectors = data;
+
+ return q && blk_queue_zone_sectors(q) == *zone_sectors;
+}
+
+static bool dm_table_matches_zone_sectors(struct dm_table *t,
+ unsigned int zone_sectors)
+{
+ struct dm_target *ti;
+ unsigned i;
+
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
+
+ if (!ti->type->iterate_devices ||
+ !ti->type->iterate_devices(ti, device_matches_zone_sectors, &zone_sectors))
+ return false;
+ }
+
+ return true;
+}
+
+static int validate_hardware_zoned_model(struct dm_table *table,
+ enum blk_zoned_model zoned_model,
+ unsigned int zone_sectors)
+{
+ if (zoned_model == BLK_ZONED_NONE)
+ return 0;
+
+ if (!dm_table_supports_zoned_model(table, zoned_model)) {
+ DMERR("%s: zoned model is not consistent across all devices",
+ dm_device_name(table->md));
+ return -EINVAL;
+ }
+
+ /* Check zone size validity and compatibility */
+ if (!zone_sectors || !is_power_of_2(zone_sectors))
+ return -EINVAL;
+
+ if (!dm_table_matches_zone_sectors(table, zone_sectors)) {
+ DMERR("%s: zone sectors is not consistent across all devices",
+ dm_device_name(table->md));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/*
* Establish the new table's queue_limits and validate them.
*/
int dm_calculate_queue_limits(struct dm_table *table,
struct queue_limits *limits)
{
- struct dm_target *uninitialized_var(ti);
+ struct dm_target *ti;
struct queue_limits ti_limits;
- unsigned i = 0;
+ unsigned i;
+ enum blk_zoned_model zoned_model = BLK_ZONED_NONE;
+ unsigned int zone_sectors = 0;
blk_set_stacking_limits(limits);
- while (i < dm_table_get_num_targets(table)) {
+ for (i = 0; i < dm_table_get_num_targets(table); i++) {
blk_set_stacking_limits(&ti_limits);
- ti = dm_table_get_target(table, i++);
+ ti = dm_table_get_target(table, i);
if (!ti->type->iterate_devices)
goto combine_limits;
@@ -1356,6 +1491,15 @@ int dm_calculate_queue_limits(struct dm_table *table,
ti->type->iterate_devices(ti, dm_set_device_limits,
&ti_limits);
+ if (zoned_model == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) {
+ /*
+ * After stacking all limits, validate all devices
+ * in table support this zoned model and zone sectors.
+ */
+ zoned_model = ti_limits.zoned;
+ zone_sectors = ti_limits.chunk_sectors;
+ }
+
/* Set I/O hints portion of queue limits */
if (ti->type->io_hints)
ti->type->io_hints(ti, &ti_limits);
@@ -1380,7 +1524,41 @@ combine_limits:
dm_device_name(table->md),
(unsigned long long) ti->begin,
(unsigned long long) ti->len);
+
+ /*
+ * FIXME: this should likely be moved to blk_stack_limits(), would
+ * also eliminate limits->zoned stacking hack in dm_set_device_limits()
+ */
+ if (limits->zoned == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) {
+ /*
+ * By default, the stacked limits zoned model is set to
+ * BLK_ZONED_NONE in blk_set_stacking_limits(). Update
+ * this model using the first target model reported
+ * that is not BLK_ZONED_NONE. This will be either the
+ * first target device zoned model or the model reported
+ * by the target .io_hints.
+ */
+ limits->zoned = ti_limits.zoned;
+ }
+ }
+
+ /*
+ * Verify that the zoned model and zone sectors, as determined before
+ * any .io_hints override, are the same across all devices in the table.
+ * - this is especially relevant if .io_hints is emulating a disk-managed
+ * zoned model (aka BLK_ZONED_NONE) on host-managed zoned block devices.
+ * BUT...
+ */
+ if (limits->zoned != BLK_ZONED_NONE) {
+ /*
+ * ...IF the above limits stacking determined a zoned model
+ * validate that all of the table's devices conform to it.
+ */
+ zoned_model = limits->zoned;
+ zone_sectors = limits->chunk_sectors;
}
+ if (validate_hardware_zoned_model(table, zoned_model, zone_sectors))
+ return -EINVAL;
return validate_hardware_logical_block_alignment(table, limits);
}
@@ -1394,6 +1572,9 @@ static void dm_table_verify_integrity(struct dm_table *t)
{
struct gendisk *template_disk = NULL;
+ if (t->integrity_added)
+ return;
+
if (t->integrity_supported) {
/*
* Verify that the original integrity profile
@@ -1424,7 +1605,7 @@ static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
{
struct dm_target *ti;
- unsigned i = 0;
+ unsigned i;
/*
* Require at least one underlying device to support flushes.
@@ -1432,8 +1613,8 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
* so we need to use iterate_devices here, which targets
* supporting flushes must provide.
*/
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
if (!ti->num_flush_bios)
continue;
@@ -1449,22 +1630,6 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
return false;
}
-static bool dm_table_discard_zeroes_data(struct dm_table *t)
-{
- struct dm_target *ti;
- unsigned i = 0;
-
- /* Ensure that all targets supports discard_zeroes_data. */
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
-
- if (ti->discard_zeroes_data_unsupported)
- return false;
- }
-
- return true;
-}
-
static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -1493,10 +1658,10 @@ static bool dm_table_all_devices_attribute(struct dm_table *t,
iterate_devices_callout_fn func)
{
struct dm_target *ti;
- unsigned i = 0;
+ unsigned i;
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
if (!ti->type->iterate_devices ||
!ti->type->iterate_devices(ti, func, NULL))
@@ -1517,22 +1682,50 @@ static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *de
static bool dm_table_supports_write_same(struct dm_table *t)
{
struct dm_target *ti;
+ unsigned i;
+
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
+
+ if (!ti->num_write_same_bios)
+ return false;
+
+ if (!ti->type->iterate_devices ||
+ ti->type->iterate_devices(ti, device_not_write_same_capable, NULL))
+ return false;
+ }
+
+ return true;
+}
+
+static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && !q->limits.max_write_zeroes_sectors;
+}
+
+static bool dm_table_supports_write_zeroes(struct dm_table *t)
+{
+ struct dm_target *ti;
unsigned i = 0;
while (i < dm_table_get_num_targets(t)) {
ti = dm_table_get_target(t, i++);
- if (!ti->num_write_same_bios)
+ if (!ti->num_write_zeroes_bios)
return false;
if (!ti->type->iterate_devices ||
- ti->type->iterate_devices(ti, device_not_write_same_capable, NULL))
+ ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL))
return false;
}
return true;
}
+
static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -1544,7 +1737,7 @@ static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
static bool dm_table_supports_discards(struct dm_table *t)
{
struct dm_target *ti;
- unsigned i = 0;
+ unsigned i;
/*
* Unless any target used by the table set discards_supported,
@@ -1553,8 +1746,8 @@ static bool dm_table_supports_discards(struct dm_table *t)
* so we need to use iterate_devices here, which targets
* supporting discard selectively must provide.
*/
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
if (!ti->num_discard_bios)
continue;
@@ -1592,9 +1785,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
}
blk_queue_write_cache(q, wc, fua);
- if (!dm_table_discard_zeroes_data(t))
- q->limits.discard_zeroes_data = 0;
-
/* Ensure that all underlying devices are non-rotational. */
if (dm_table_all_devices_attribute(t, device_is_nonrot))
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
@@ -1603,6 +1793,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (!dm_table_supports_write_same(t))
q->limits.max_write_same_sectors = 0;
+ if (!dm_table_supports_write_zeroes(t))
+ q->limits.max_write_zeroes_sectors = 0;
if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
@@ -1661,6 +1853,8 @@ static void suspend_targets(struct dm_table *t, enum suspend_mode mode)
int i = t->num_targets;
struct dm_target *ti = t->targets;
+ lockdep_assert_held(&t->md->suspend_lock);
+
while (i--) {
switch (mode) {
case PRESUSPEND:
@@ -1708,6 +1902,8 @@ int dm_table_resume_targets(struct dm_table *t)
{
int i, r = 0;
+ lockdep_assert_held(&t->md->suspend_lock);
+
for (i = 0; i < t->num_targets; i++) {
struct dm_target *ti = t->targets + i;
@@ -1750,7 +1946,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
char b[BDEVNAME_SIZE];
if (likely(q))
- r |= bdi_congested(&q->backing_dev_info, bdi_bits);
+ r |= bdi_congested(q->backing_dev_info, bdi_bits);
else
DMWARN_LIMIT("%s: any_congested: nonexistent device %s",
dm_device_name(t->md),
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 710ae28fd618..c0d7e60820c4 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -128,28 +128,22 @@ static void io_err_dtr(struct dm_target *tt)
static int io_err_map(struct dm_target *tt, struct bio *bio)
{
- return -EIO;
-}
-
-static int io_err_map_rq(struct dm_target *ti, struct request *clone,
- union map_info *map_context)
-{
- return -EIO;
+ return DM_MAPIO_KILL;
}
static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq,
union map_info *map_context,
struct request **clone)
{
- return -EIO;
+ return DM_MAPIO_KILL;
}
static void io_err_release_clone_rq(struct request *clone)
{
}
-static long io_err_direct_access(struct dm_target *ti, sector_t sector,
- void **kaddr, pfn_t *pfn, long size)
+static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
+ long nr_pages, void **kaddr, pfn_t *pfn)
{
return -EIO;
}
@@ -161,10 +155,9 @@ static struct target_type error_target = {
.ctr = io_err_ctr,
.dtr = io_err_dtr,
.map = io_err_map,
- .map_rq = io_err_map_rq,
.clone_and_map_rq = io_err_clone_and_map_rq,
.release_clone_rq = io_err_release_clone_rq,
- .direct_access = io_err_direct_access,
+ .direct_access = io_err_dax_direct_access,
};
int __init dm_target_init(void)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index a15091a0d40c..d31d18d9727c 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -77,7 +77,6 @@
#define THIN_SUPERBLOCK_MAGIC 27022010
#define THIN_SUPERBLOCK_LOCATION 0
#define THIN_VERSION 2
-#define THIN_METADATA_CACHE_SIZE 64
#define SECTOR_TO_BLOCK_SHIFT 3
/*
@@ -485,11 +484,11 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
if (r < 0)
return r;
- r = save_sm_roots(pmd);
+ r = dm_tm_pre_commit(pmd->tm);
if (r < 0)
return r;
- r = dm_tm_pre_commit(pmd->tm);
+ r = save_sm_roots(pmd);
if (r < 0)
return r;
@@ -686,7 +685,6 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f
int r;
pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
- THIN_METADATA_CACHE_SIZE,
THIN_MAX_CONCURRENT_LOCKS);
if (IS_ERR(pmd->bm)) {
DMERR("could not create block manager");
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index d1c05c12a9db..9dec2f8cc739 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -5,7 +5,7 @@
*/
#include "dm-thin-metadata.h"
-#include "dm-bio-prison.h"
+#include "dm-bio-prison-v1.h"
#include "dm.h"
#include <linux/device-mapper.h>
@@ -383,8 +383,8 @@ static void end_discard(struct discard_op *op, int r)
* Even if r is set, there could be sub discards in flight that we
* need to wait for.
*/
- if (r && !op->parent_bio->bi_error)
- op->parent_bio->bi_error = r;
+ if (r && !op->parent_bio->bi_status)
+ op->parent_bio->bi_status = errno_to_blk_status(r);
bio_endio(op->parent_bio);
}
@@ -450,22 +450,20 @@ static void cell_release_no_holder(struct pool *pool,
}
static void cell_error_with_code(struct pool *pool,
- struct dm_bio_prison_cell *cell, int error_code)
+ struct dm_bio_prison_cell *cell, blk_status_t error_code)
{
dm_cell_error(pool->prison, cell, error_code);
dm_bio_prison_free_cell(pool->prison, cell);
}
-static int get_pool_io_error_code(struct pool *pool)
+static blk_status_t get_pool_io_error_code(struct pool *pool)
{
- return pool->out_of_data_space ? -ENOSPC : -EIO;
+ return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR;
}
static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
{
- int error = get_pool_io_error_code(pool);
-
- cell_error_with_code(pool, cell, error);
+ cell_error_with_code(pool, cell, get_pool_io_error_code(pool));
}
static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
@@ -475,7 +473,7 @@ static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
{
- cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE);
+ cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE);
}
/*----------------------------------------------------------------*/
@@ -555,17 +553,18 @@ static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
bio_list_init(master);
}
-static void error_bio_list(struct bio_list *bios, int error)
+static void error_bio_list(struct bio_list *bios, blk_status_t error)
{
struct bio *bio;
while ((bio = bio_list_pop(bios))) {
- bio->bi_error = error;
+ bio->bi_status = error;
bio_endio(bio);
}
}
-static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
+static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
+ blk_status_t error)
{
struct bio_list bios;
unsigned long flags;
@@ -608,11 +607,11 @@ static void requeue_io(struct thin_c *tc)
__merge_bio_list(&bios, &tc->retry_on_resume_list);
spin_unlock_irqrestore(&tc->lock, flags);
- error_bio_list(&bios, DM_ENDIO_REQUEUE);
+ error_bio_list(&bios, BLK_STS_DM_REQUEUE);
requeue_deferred_cells(tc);
}
-static void error_retry_list_with_code(struct pool *pool, int error)
+static void error_retry_list_with_code(struct pool *pool, blk_status_t error)
{
struct thin_c *tc;
@@ -624,9 +623,7 @@ static void error_retry_list_with_code(struct pool *pool, int error)
static void error_retry_list(struct pool *pool)
{
- int error = get_pool_io_error_code(pool);
-
- error_retry_list_with_code(pool, error);
+ error_retry_list_with_code(pool, get_pool_io_error_code(pool));
}
/*
@@ -699,7 +696,7 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio)
static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
{
- return (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) &&
+ return op_is_flush(bio->bi_opf) &&
dm_thin_changed_this_transaction(tc->td);
}
@@ -774,7 +771,7 @@ struct dm_thin_new_mapping {
*/
atomic_t prepare_actions;
- int err;
+ blk_status_t status;
struct thin_c *tc;
dm_block_t virt_begin, virt_end;
dm_block_t data_block;
@@ -814,7 +811,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
{
struct dm_thin_new_mapping *m = context;
- m->err = read_err || write_err ? -EIO : 0;
+ m->status = read_err || write_err ? BLK_STS_IOERR : 0;
complete_mapping_preparation(m);
}
@@ -825,7 +822,7 @@ static void overwrite_endio(struct bio *bio)
bio->bi_end_io = m->saved_bi_end_io;
- m->err = bio->bi_error;
+ m->status = bio->bi_status;
complete_mapping_preparation(m);
}
@@ -870,8 +867,7 @@ static void __inc_remap_and_issue_cell(void *context,
struct bio *bio;
while ((bio = bio_list_pop(&cell->bios))) {
- if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
- bio_op(bio) == REQ_OP_DISCARD)
+ if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
bio_list_add(&info->defer_bios, bio);
else {
inc_all_io_entry(info->tc->pool, bio);
@@ -926,7 +922,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
struct bio *bio = m->bio;
int r;
- if (m->err) {
+ if (m->status) {
cell_error(pool, m->cell);
goto out;
}
@@ -1070,6 +1066,7 @@ static void passdown_endio(struct bio *bio)
* to unmap (we ignore err).
*/
queue_passdown_pt2(bio->bi_private);
+ bio_put(bio);
}
static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
@@ -1094,6 +1091,19 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
return;
}
+ /*
+ * Increment the unmapped blocks. This prevents a race between the
+ * passdown io and reallocation of freed blocks.
+ */
+ r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
+ if (r) {
+ metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
+ bio_io_error(m->bio);
+ cell_defer_no_holder(tc, m->cell);
+ mempool_free(m, pool->mapping_pool);
+ return;
+ }
+
discard_parent = bio_alloc(GFP_NOIO, 1);
if (!discard_parent) {
DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
@@ -1114,19 +1124,6 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
end_discard(&op, r);
}
}
-
- /*
- * Increment the unmapped blocks. This prevents a race between the
- * passdown io and reallocation of freed blocks.
- */
- r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
- if (r) {
- metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
- bio_io_error(m->bio);
- cell_defer_no_holder(tc, m->cell);
- mempool_free(m, pool->mapping_pool);
- return;
- }
}
static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
@@ -1495,7 +1492,7 @@ static void retry_on_resume(struct bio *bio)
spin_unlock_irqrestore(&tc->lock, flags);
}
-static int should_error_unserviceable_bio(struct pool *pool)
+static blk_status_t should_error_unserviceable_bio(struct pool *pool)
{
enum pool_mode m = get_pool_mode(pool);
@@ -1503,27 +1500,27 @@ static int should_error_unserviceable_bio(struct pool *pool)
case PM_WRITE:
/* Shouldn't get here */
DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
- return -EIO;
+ return BLK_STS_IOERR;
case PM_OUT_OF_DATA_SPACE:
- return pool->pf.error_if_no_space ? -ENOSPC : 0;
+ return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
case PM_READ_ONLY:
case PM_FAIL:
- return -EIO;
+ return BLK_STS_IOERR;
default:
/* Shouldn't get here */
DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
- return -EIO;
+ return BLK_STS_IOERR;
}
}
static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
{
- int error = should_error_unserviceable_bio(pool);
+ blk_status_t error = should_error_unserviceable_bio(pool);
if (error) {
- bio->bi_error = error;
+ bio->bi_status = error;
bio_endio(bio);
} else
retry_on_resume(bio);
@@ -1533,7 +1530,7 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
{
struct bio *bio;
struct bio_list bios;
- int error;
+ blk_status_t error;
error = should_error_unserviceable_bio(pool);
if (error) {
@@ -1716,9 +1713,8 @@ static void __remap_and_issue_shared_cell(void *context,
struct bio *bio;
while ((bio = bio_list_pop(&cell->bios))) {
- if ((bio_data_dir(bio) == WRITE) ||
- (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
- bio_op(bio) == REQ_OP_DISCARD))
+ if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) ||
+ bio_op(bio) == REQ_OP_DISCARD)
bio_list_add(&info->defer_bios, bio);
else {
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));;
@@ -2072,7 +2068,8 @@ static void process_thin_deferred_bios(struct thin_c *tc)
unsigned count = 0;
if (tc->requeue_mode) {
- error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE);
+ error_thin_bio_list(tc, &tc->deferred_bio_list,
+ BLK_STS_DM_REQUEUE);
return;
}
@@ -2323,7 +2320,7 @@ static void do_no_space_timeout(struct work_struct *ws)
if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
pool->pf.error_if_no_space = true;
notify_of_pool_mode_change_to_oods(pool);
- error_retry_list_with_code(pool, -ENOSPC);
+ error_retry_list_with_code(pool, BLK_STS_NOSPC);
}
}
@@ -2625,7 +2622,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
thin_hook_bio(tc, bio);
if (tc->requeue_mode) {
- bio->bi_error = DM_ENDIO_REQUEUE;
+ bio->bi_status = BLK_STS_DM_REQUEUE;
bio_endio(bio);
return DM_MAPIO_SUBMITTED;
}
@@ -2635,8 +2632,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
- if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
- bio_op(bio) == REQ_OP_DISCARD) {
+ if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) {
thin_defer_bio_with_throttle(tc, bio);
return DM_MAPIO_SUBMITTED;
}
@@ -2714,7 +2710,7 @@ static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
return 1;
q = bdev_get_queue(pt->data_dev->bdev);
- return bdi_congested(&q->backing_dev_info, bdi_bits);
+ return bdi_congested(q->backing_dev_info, bdi_bits);
}
static void requeue_bios(struct pool *pool)
@@ -3266,7 +3262,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
* them down to the data device. The thin device's discard
* processing will cause mappings to be removed from the btree.
*/
- ti->discard_zeroes_data_unsupported = true;
if (pf.discard_enabled && pf.discard_passdown) {
ti->num_discard_bios = 1;
@@ -4122,7 +4117,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
/* In case the pool supports discards, pass them on. */
- ti->discard_zeroes_data_unsupported = true;
if (tc->pool->pf.discard_enabled) {
ti->discards_supported = true;
ti->num_discard_bios = 1;
@@ -4181,7 +4175,8 @@ static int thin_map(struct dm_target *ti, struct bio *bio)
return thin_bio_map(ti, bio);
}
-static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
+static int thin_endio(struct dm_target *ti, struct bio *bio,
+ blk_status_t *err)
{
unsigned long flags;
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -4216,7 +4211,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
if (h->cell)
cell_defer_no_holder(h->tc, h->cell);
- return 0;
+ return DM_ENDIO_DONE;
}
static void thin_presuspend(struct dm_target *ti)
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 0f0eb8a3d922..504ba3fa328b 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -146,8 +146,6 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio,
block = fec_buffer_rs_block(v, fio, n, i);
res = fec_decode_rs8(v, fio, block, &par[offset], neras);
if (res < 0) {
- dm_bufio_release(buf);
-
r = res;
goto error;
}
@@ -172,6 +170,8 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio,
done:
r = corrected;
error:
+ dm_bufio_release(buf);
+
if (r < 0 && neras)
DMERR_LIMIT("%s: FEC %llu: failed to correct: %d",
v->data_dev->name, (unsigned long long)rsb, r);
@@ -188,7 +188,7 @@ error:
static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io,
u8 *want_digest, u8 *data)
{
- if (unlikely(verity_hash(v, verity_io_hash_desc(v, io),
+ if (unlikely(verity_hash(v, verity_io_hash_req(v, io),
data, 1 << v->data_dev_block_bits,
verity_io_real_digest(v, io))))
return 0;
@@ -269,7 +269,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
&is_zero) == 0) {
/* skip known zero blocks entirely */
if (is_zero)
- continue;
+ goto done;
/*
* skip if we have already found the theoretical
@@ -397,7 +397,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
}
/* Always re-validate the corrected block against the expected hash */
- r = verity_hash(v, verity_io_hash_desc(v, io), fio->output,
+ r = verity_hash(v, verity_io_hash_req(v, io), fio->output,
1 << v->data_dev_block_bits,
verity_io_real_digest(v, io));
if (unlikely(r < 0))
@@ -439,6 +439,13 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
if (!verity_fec_is_enabled(v))
return -EOPNOTSUPP;
+ if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) {
+ DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name);
+ return -EIO;
+ }
+
+ fio->level++;
+
if (type == DM_VERITY_BLOCK_TYPE_METADATA)
block += v->data_blocks;
@@ -470,7 +477,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
if (r < 0) {
r = fec_decode_rsb(v, io, fio, rsb, offset, true);
if (r < 0)
- return r;
+ goto done;
}
if (dest)
@@ -480,6 +487,8 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
r = verity_for_bv_block(v, io, iter, fec_bv_copy);
}
+done:
+ fio->level--;
return r;
}
@@ -520,6 +529,7 @@ void verity_fec_init_io(struct dm_verity_io *io)
memset(fio->bufs, 0, sizeof(fio->bufs));
fio->nbufs = 0;
fio->output = NULL;
+ fio->level = 0;
}
/*
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index 7fa0298b995e..bb31ce87a933 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h
@@ -27,6 +27,9 @@
#define DM_VERITY_FEC_BUF_MAX \
(1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS))
+/* maximum recursion level for verity_fec_decode */
+#define DM_VERITY_FEC_MAX_RECURSION 4
+
#define DM_VERITY_OPT_FEC_DEV "use_fec_from_device"
#define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks"
#define DM_VERITY_OPT_FEC_START "fec_start"
@@ -58,6 +61,7 @@ struct dm_verity_fec_io {
unsigned nbufs; /* number of buffers allocated */
u8 *output; /* buffer for corrected output */
size_t output_pos;
+ unsigned level; /* recursion level */
};
#ifdef CONFIG_DM_VERITY_FEC
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 7335d8a3fc47..b46705ebf01f 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -93,81 +93,123 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
}
/*
- * Wrapper for crypto_shash_init, which handles verity salting.
+ * Callback function for asynchronous crypto API completion notification
*/
-static int verity_hash_init(struct dm_verity *v, struct shash_desc *desc)
+static void verity_op_done(struct crypto_async_request *base, int err)
{
- int r;
+ struct verity_result *res = (struct verity_result *)base->data;
- desc->tfm = v->tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ if (err == -EINPROGRESS)
+ return;
- r = crypto_shash_init(desc);
+ res->err = err;
+ complete(&res->completion);
+}
- if (unlikely(r < 0)) {
- DMERR("crypto_shash_init failed: %d", r);
- return r;
- }
+/*
+ * Wait for async crypto API callback
+ */
+static inline int verity_complete_op(struct verity_result *res, int ret)
+{
+ switch (ret) {
+ case 0:
+ break;
- if (likely(v->version >= 1)) {
- r = crypto_shash_update(desc, v->salt, v->salt_size);
+ case -EINPROGRESS:
+ case -EBUSY:
+ ret = wait_for_completion_interruptible(&res->completion);
+ if (!ret)
+ ret = res->err;
+ reinit_completion(&res->completion);
+ break;
- if (unlikely(r < 0)) {
- DMERR("crypto_shash_update failed: %d", r);
- return r;
- }
+ default:
+ DMERR("verity_wait_hash: crypto op submission failed: %d", ret);
}
- return 0;
+ if (unlikely(ret < 0))
+ DMERR("verity_wait_hash: crypto op failed: %d", ret);
+
+ return ret;
}
-static int verity_hash_update(struct dm_verity *v, struct shash_desc *desc,
- const u8 *data, size_t len)
+static int verity_hash_update(struct dm_verity *v, struct ahash_request *req,
+ const u8 *data, size_t len,
+ struct verity_result *res)
{
- int r = crypto_shash_update(desc, data, len);
+ struct scatterlist sg;
- if (unlikely(r < 0))
- DMERR("crypto_shash_update failed: %d", r);
+ sg_init_one(&sg, data, len);
+ ahash_request_set_crypt(req, &sg, NULL, len);
+
+ return verity_complete_op(res, crypto_ahash_update(req));
+}
+
+/*
+ * Wrapper for crypto_ahash_init, which handles verity salting.
+ */
+static int verity_hash_init(struct dm_verity *v, struct ahash_request *req,
+ struct verity_result *res)
+{
+ int r;
+
+ ahash_request_set_tfm(req, v->tfm);
+ ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+ CRYPTO_TFM_REQ_MAY_BACKLOG,
+ verity_op_done, (void *)res);
+ init_completion(&res->completion);
+
+ r = verity_complete_op(res, crypto_ahash_init(req));
+
+ if (unlikely(r < 0)) {
+ DMERR("crypto_ahash_init failed: %d", r);
+ return r;
+ }
+
+ if (likely(v->salt_size && (v->version >= 1)))
+ r = verity_hash_update(v, req, v->salt, v->salt_size, res);
return r;
}
-static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc,
- u8 *digest)
+static int verity_hash_final(struct dm_verity *v, struct ahash_request *req,
+ u8 *digest, struct verity_result *res)
{
int r;
- if (unlikely(!v->version)) {
- r = crypto_shash_update(desc, v->salt, v->salt_size);
+ if (unlikely(v->salt_size && (!v->version))) {
+ r = verity_hash_update(v, req, v->salt, v->salt_size, res);
if (r < 0) {
- DMERR("crypto_shash_update failed: %d", r);
- return r;
+ DMERR("verity_hash_final failed updating salt: %d", r);
+ goto out;
}
}
- r = crypto_shash_final(desc, digest);
-
- if (unlikely(r < 0))
- DMERR("crypto_shash_final failed: %d", r);
-
+ ahash_request_set_crypt(req, NULL, digest, 0);
+ r = verity_complete_op(res, crypto_ahash_final(req));
+out:
return r;
}
-int verity_hash(struct dm_verity *v, struct shash_desc *desc,
+int verity_hash(struct dm_verity *v, struct ahash_request *req,
const u8 *data, size_t len, u8 *digest)
{
int r;
+ struct verity_result res;
- r = verity_hash_init(v, desc);
+ r = verity_hash_init(v, req, &res);
if (unlikely(r < 0))
- return r;
+ goto out;
- r = verity_hash_update(v, desc, data, len);
+ r = verity_hash_update(v, req, data, len, &res);
if (unlikely(r < 0))
- return r;
+ goto out;
+
+ r = verity_hash_final(v, req, digest, &res);
- return verity_hash_final(v, desc, digest);
+out:
+ return r;
}
static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
@@ -275,7 +317,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
goto release_ret_r;
}
- r = verity_hash(v, verity_io_hash_desc(v, io),
+ r = verity_hash(v, verity_io_hash_req(v, io),
data, 1 << v->hash_dev_block_bits,
verity_io_real_digest(v, io));
if (unlikely(r < 0))
@@ -344,6 +386,49 @@ out:
}
/*
+ * Calculates the digest for the given bio
+ */
+int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io,
+ struct bvec_iter *iter, struct verity_result *res)
+{
+ /* Number of bytes still to feed to the hash: one full data block. */
+ unsigned int todo = 1 << v->data_dev_block_bits;
+ struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
+ struct scatterlist sg;
+ struct ahash_request *req = verity_io_hash_req(v, io);
+
+ /*
+ * Feed the block to the hash one bio_vec segment at a time. This only
+ * issues hash updates on req; no init or final call is made here.
+ */
+ do {
+ int r;
+ unsigned int len;
+ struct bio_vec bv = bio_iter_iovec(bio, *iter);
+
+ sg_init_table(&sg, 1);
+
+ len = bv.bv_len;
+
+ /* Never hash past the end of the current data block. */
+ if (likely(len >= todo))
+ len = todo;
+ /*
+ * Operating on a single page at a time looks suboptimal
+ * until you consider the typical block size is 4,096B.
+ * Going through this loop twice should be very rare.
+ */
+ sg_set_page(&sg, bv.bv_page, len, bv.bv_offset);
+ ahash_request_set_crypt(req, &sg, NULL, len);
+ r = verity_complete_op(res, crypto_ahash_update(req));
+
+ if (unlikely(r < 0)) {
+ DMERR("verity_for_io_block crypto op failed: %d", r);
+ return r;
+ }
+
+ /* Consume the hashed bytes from the caller's iterator. */
+ bio_advance_iter(bio, iter, len);
+ todo -= len;
+ } while (todo);
+
+ return 0;
+}
+
+/*
* Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec
* starting from iter.
*/
@@ -381,12 +466,6 @@ int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
return 0;
}
-static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io,
- u8 *data, size_t len)
-{
- return verity_hash_update(v, verity_io_hash_desc(v, io), data, len);
-}
-
static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io,
u8 *data, size_t len)
{
@@ -403,10 +482,11 @@ static int verity_verify_io(struct dm_verity_io *io)
struct dm_verity *v = io->v;
struct bvec_iter start;
unsigned b;
+ struct verity_result res;
for (b = 0; b < io->n_blocks; b++) {
int r;
- struct shash_desc *desc = verity_io_hash_desc(v, io);
+ struct ahash_request *req = verity_io_hash_req(v, io);
r = verity_hash_for_block(v, io, io->block + b,
verity_io_want_digest(v, io),
@@ -427,16 +507,17 @@ static int verity_verify_io(struct dm_verity_io *io)
continue;
}
- r = verity_hash_init(v, desc);
+ r = verity_hash_init(v, req, &res);
if (unlikely(r < 0))
return r;
start = io->iter;
- r = verity_for_bv_block(v, io, &io->iter, verity_bv_hash_update);
+ r = verity_for_io_block(v, io, &io->iter, &res);
if (unlikely(r < 0))
return r;
- r = verity_hash_final(v, desc, verity_io_real_digest(v, io));
+ r = verity_hash_final(v, req, verity_io_real_digest(v, io),
+ &res);
if (unlikely(r < 0))
return r;
@@ -457,13 +538,13 @@ static int verity_verify_io(struct dm_verity_io *io)
/*
* End one "io" structure with a given error.
*/
-static void verity_finish_io(struct dm_verity_io *io, int error)
+static void verity_finish_io(struct dm_verity_io *io, blk_status_t status)
{
struct dm_verity *v = io->v;
struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
bio->bi_end_io = io->orig_bi_end_io;
- bio->bi_error = error;
+ bio->bi_status = status;
verity_fec_finish_io(io);
@@ -474,15 +555,15 @@ static void verity_work(struct work_struct *w)
{
struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
- verity_finish_io(io, verity_verify_io(io));
+ verity_finish_io(io, errno_to_blk_status(verity_verify_io(io)));
}
static void verity_end_io(struct bio *bio)
{
struct dm_verity_io *io = bio->bi_private;
- if (bio->bi_error && !verity_fec_is_enabled(io->v)) {
- verity_finish_io(io, bio->bi_error);
+ if (bio->bi_status && !verity_fec_is_enabled(io->v)) {
+ verity_finish_io(io, bio->bi_status);
return;
}
@@ -562,17 +643,17 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
DMERR_LIMIT("unaligned io");
- return -EIO;
+ return DM_MAPIO_KILL;
}
if (bio_end_sector(bio) >>
(v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
DMERR_LIMIT("io out of range");
- return -EIO;
+ return DM_MAPIO_KILL;
}
if (bio_data_dir(bio) == WRITE)
- return -EIO;
+ return DM_MAPIO_KILL;
io = dm_per_bio_data(bio, ti->per_io_data_size);
io->v = v;
@@ -705,7 +786,7 @@ static void verity_dtr(struct dm_target *ti)
kfree(v->zero_digest);
if (v->tfm)
- crypto_free_shash(v->tfm);
+ crypto_free_ahash(v->tfm);
kfree(v->alg_name);
@@ -723,7 +804,7 @@ static void verity_dtr(struct dm_target *ti)
static int verity_alloc_zero_digest(struct dm_verity *v)
{
int r = -ENOMEM;
- struct shash_desc *desc;
+ struct ahash_request *req;
u8 *zero_data;
v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL);
@@ -731,9 +812,9 @@ static int verity_alloc_zero_digest(struct dm_verity *v)
if (!v->zero_digest)
return r;
- desc = kmalloc(v->shash_descsize, GFP_KERNEL);
+ req = kmalloc(v->ahash_reqsize, GFP_KERNEL);
- if (!desc)
+ if (!req)
return r; /* verity_dtr will free zero_digest */
zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL);
@@ -741,11 +822,11 @@ static int verity_alloc_zero_digest(struct dm_verity *v)
if (!zero_data)
goto out;
- r = verity_hash(v, desc, zero_data, 1 << v->data_dev_block_bits,
+ r = verity_hash(v, req, zero_data, 1 << v->data_dev_block_bits,
v->zero_digest);
out:
- kfree(desc);
+ kfree(req);
kfree(zero_data);
return r;
@@ -923,21 +1004,21 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
- v->tfm = crypto_alloc_shash(v->alg_name, 0, 0);
+ v->tfm = crypto_alloc_ahash(v->alg_name, 0, 0);
if (IS_ERR(v->tfm)) {
ti->error = "Cannot initialize hash function";
r = PTR_ERR(v->tfm);
v->tfm = NULL;
goto bad;
}
- v->digest_size = crypto_shash_digestsize(v->tfm);
+ v->digest_size = crypto_ahash_digestsize(v->tfm);
if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
ti->error = "Digest size too big";
r = -EINVAL;
goto bad;
}
- v->shash_descsize =
- sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm);
+ v->ahash_reqsize = sizeof(struct ahash_request) +
+ crypto_ahash_reqsize(v->tfm);
v->root_digest = kmalloc(v->digest_size, GFP_KERNEL);
if (!v->root_digest) {
@@ -1037,7 +1118,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
ti->per_io_data_size = sizeof(struct dm_verity_io) +
- v->shash_descsize + v->digest_size * 2;
+ v->ahash_reqsize + v->digest_size * 2;
r = verity_fec_ctr(v);
if (r)
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index fb419f422d73..a59e0ada6fd3 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -37,7 +37,7 @@ struct dm_verity {
struct dm_target *ti;
struct dm_bufio_client *bufio;
char *alg_name;
- struct crypto_shash *tfm;
+ struct crypto_ahash *tfm;
u8 *root_digest; /* digest of the root block */
u8 *salt; /* salt: its size is salt_size */
u8 *zero_digest; /* digest for a zero block */
@@ -52,7 +52,7 @@ struct dm_verity {
unsigned char levels; /* the number of tree levels */
unsigned char version;
unsigned digest_size; /* digest size for the current hash algorithm */
- unsigned shash_descsize;/* the size of temporary space for crypto */
+ unsigned int ahash_reqsize;/* the size of temporary space for crypto */
int hash_failed; /* set to 1 if hash of any block failed */
enum verity_mode mode; /* mode for handling verification errors */
unsigned corrupted_errs;/* Number of errors for corrupted blocks */
@@ -81,31 +81,36 @@ struct dm_verity_io {
/*
* Three variably-size fields follow this struct:
*
- * u8 hash_desc[v->shash_descsize];
+ * u8 hash_req[v->ahash_reqsize];
* u8 real_digest[v->digest_size];
* u8 want_digest[v->digest_size];
*
- * To access them use: verity_io_hash_desc(), verity_io_real_digest()
+ * To access them use: verity_io_hash_req(), verity_io_real_digest()
* and verity_io_want_digest().
*/
};
-static inline struct shash_desc *verity_io_hash_desc(struct dm_verity *v,
+struct verity_result {
+ struct completion completion;
+ int err;
+};
+
+static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v,
struct dm_verity_io *io)
{
- return (struct shash_desc *)(io + 1);
+ return (struct ahash_request *)(io + 1);
}
static inline u8 *verity_io_real_digest(struct dm_verity *v,
struct dm_verity_io *io)
{
- return (u8 *)(io + 1) + v->shash_descsize;
+ return (u8 *)(io + 1) + v->ahash_reqsize;
}
static inline u8 *verity_io_want_digest(struct dm_verity *v,
struct dm_verity_io *io)
{
- return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
+ return (u8 *)(io + 1) + v->ahash_reqsize + v->digest_size;
}
static inline u8 *verity_io_digest_end(struct dm_verity *v,
@@ -120,7 +125,7 @@ extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
struct dm_verity_io *io,
u8 *data, size_t len));
-extern int verity_hash(struct dm_verity *v, struct shash_desc *desc,
+extern int verity_hash(struct dm_verity *v, struct ahash_request *req,
const u8 *data, size_t len, u8 *digest);
extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index b616f11d8473..b65ca8dcfbdc 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -39,7 +39,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio)
case REQ_OP_READ:
if (bio->bi_opf & REQ_RAHEAD) {
/* readahead of null bytes only wastes buffer cache */
- return -EIO;
+ return DM_MAPIO_KILL;
}
zero_fill_bio(bio);
break;
@@ -47,7 +47,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio)
/* writes get silently dropped */
break;
default:
- return -EIO;
+ return DM_MAPIO_KILL;
}
bio_endio(bio);
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
new file mode 100644
index 000000000000..884ff7c170a0
--- /dev/null
+++ b/drivers/md/dm-zoned-metadata.c
@@ -0,0 +1,2509 @@
+/*
+ * Copyright (C) 2017 Western Digital Corporation or its affiliates.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-zoned.h"
+
+#include <linux/module.h>
+#include <linux/crc32.h>
+
+#define DM_MSG_PREFIX "zoned metadata"
+
+/*
+ * Metadata version.
+ */
+#define DMZ_META_VER 1
+
+/*
+ * On-disk super block magic.
+ */
+#define DMZ_MAGIC ((((unsigned int)('D')) << 24) | \
+ (((unsigned int)('Z')) << 16) | \
+ (((unsigned int)('B')) << 8) | \
+ ((unsigned int)('D')))
+
+/*
+ * On disk super block.
+ * This uses only 512 B but uses on disk a full 4KB block. This block is
+ * followed on disk by the mapping table of chunks to zones and the bitmap
+ * blocks indicating zone block validity.
+ * The overall resulting metadata format is:
+ * (1) Super block (1 block)
+ * (2) Chunk mapping table (nr_map_blocks)
+ * (3) Bitmap blocks (nr_bitmap_blocks)
+ * All metadata blocks are stored in conventional zones, starting from
+ * the first conventional zone found on disk.
+ */
+struct dmz_super {
+ /* Magic number */
+ __le32 magic; /* 4 */
+
+ /* Metadata version number */
+ __le32 version; /* 8 */
+
+ /* Generation number */
+ __le64 gen; /* 16 */
+
+ /* This block number */
+ __le64 sb_block; /* 24 */
+
+ /* The number of metadata blocks, including this super block */
+ __le32 nr_meta_blocks; /* 28 */
+
+ /* The number of sequential zones reserved for reclaim */
+ __le32 nr_reserved_seq; /* 32 */
+
+ /* The number of entries in the mapping table */
+ __le32 nr_chunks; /* 36 */
+
+ /* The number of blocks used for the chunk mapping table */
+ __le32 nr_map_blocks; /* 40 */
+
+ /* The number of blocks used for the block bitmaps */
+ __le32 nr_bitmap_blocks; /* 44 */
+
+ /* Checksum */
+ __le32 crc; /* 48 */
+
+ /* Padding to full 512B sector */
+ u8 reserved[464]; /* 512 */
+};
+
+/*
+ * Chunk mapping entry: entries are indexed by chunk number
+ * and give the zone ID (dzone_id) mapping the chunk on disk.
+ * This zone may be sequential or random. If it is a sequential
+ * zone, a second zone (bzone_id) used as a write buffer may
+ * also be specified. This second zone will always be a randomly
+ * writeable zone.
+ */
+struct dmz_map {
+ __le32 dzone_id;
+ __le32 bzone_id;
+};
+
+/*
+ * Chunk mapping table metadata: 512 8-bytes entries per 4KB block.
+ */
+#define DMZ_MAP_ENTRIES (DMZ_BLOCK_SIZE / sizeof(struct dmz_map))
+#define DMZ_MAP_ENTRIES_SHIFT (ilog2(DMZ_MAP_ENTRIES))
+#define DMZ_MAP_ENTRIES_MASK (DMZ_MAP_ENTRIES - 1)
+#define DMZ_MAP_UNMAPPED UINT_MAX
+
+/*
+ * Meta data block descriptor (for cached metadata blocks).
+ */
+struct dmz_mblock {
+ /* Node in the per-metadata rbtree of cached blocks, keyed by "no" */
+ struct rb_node node;
+ /* Link in the LRU list, the dirty list, or a flush write list */
+ struct list_head link;
+ /* Block number, relative to the super block of a metadata set */
+ sector_t no;
+ /* Reference count taken by block users */
+ atomic_t ref;
+ /* Bit field of DMZ_META_* state flags */
+ unsigned long state;
+ /* Page backing the block data */
+ struct page *page;
+ /* Kernel address of the page (page_address(page)) */
+ void *data;
+};
+
+/*
+ * Metadata block state flags.
+ */
+enum {
+ /* Block modified in memory and queued on the dirty list */
+ DMZ_META_DIRTY,
+ /* A read BIO is in flight; cleared by the BIO end callback */
+ DMZ_META_READING,
+ /* A write BIO is in flight; cleared by the BIO end callback */
+ DMZ_META_WRITING,
+ /* A BIO for this block failed or could not be issued */
+ DMZ_META_ERROR,
+};
+
+/*
+ * Super block information (one per metadata set).
+ */
+struct dmz_sb {
+ /* Block number of this set's super block on the device */
+ sector_t block;
+ /* Metadata block descriptor holding the super block page */
+ struct dmz_mblock *mblk;
+ /* Super block contents (points at mblk->data) */
+ struct dmz_super *sb;
+};
+
+/*
+ * In-memory metadata.
+ */
+struct dmz_metadata {
+ struct dmz_dev *dev;
+
+ sector_t zone_bitmap_size;
+ unsigned int zone_nr_bitmap_blocks;
+
+ unsigned int nr_bitmap_blocks;
+ unsigned int nr_map_blocks;
+
+ unsigned int nr_useable_zones;
+ unsigned int nr_meta_blocks;
+ unsigned int nr_meta_zones;
+ unsigned int nr_data_zones;
+ unsigned int nr_rnd_zones;
+ unsigned int nr_reserved_seq;
+ unsigned int nr_chunks;
+
+ /* Zone information array */
+ struct dm_zone *zones;
+
+ /* Zone whose start block holds the primary super block */
+ struct dm_zone *sb_zone;
+ /* The two metadata sets; mblk_primary indexes the primary one */
+ struct dmz_sb sb[2];
+ unsigned int mblk_primary;
+ /* Super block generation; incremented after each successful flush */
+ u64 sb_gen;
+ /* Bounds used when shrinking/recycling the metadata block cache */
+ unsigned int min_nr_mblks;
+ unsigned int max_nr_mblks;
+ /* Number of allocated metadata block descriptors */
+ atomic_t nr_mblks;
+ /* Held for read by metadata users, for write by metadata flush */
+ struct rw_semaphore mblk_sem;
+ /* Serializes executions of dmz_flush_metadata() */
+ struct mutex mblk_flush_lock;
+ /* Protects the block rbtree and the LRU and dirty lists */
+ spinlock_t mblk_lock;
+ struct rb_root mblk_rbtree;
+ struct list_head mblk_lru_list;
+ struct list_head mblk_dirty_list;
+ struct shrinker mblk_shrinker;
+
+ /* Zone allocation management */
+ struct mutex map_lock;
+ struct dmz_mblock **map_mblk;
+ unsigned int nr_rnd;
+ atomic_t unmap_nr_rnd;
+ struct list_head unmap_rnd_list;
+ struct list_head map_rnd_list;
+
+ unsigned int nr_seq;
+ atomic_t unmap_nr_seq;
+ struct list_head unmap_seq_list;
+ struct list_head map_seq_list;
+
+ atomic_t nr_reserved_seq_zones;
+ struct list_head reserved_seq_zones_list;
+
+ wait_queue_head_t free_wq;
+};
+
+/*
+ * Various accessors
+ */
+unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	/* A zone ID is the zone's index in the zone information array. */
+	unsigned int index = zone - zmd->zones;
+
+	return index;
+}
+
+sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	/* Widen the zone ID before shifting so the result is a full sector_t. */
+	sector_t zone_id = dmz_id(zmd, zone);
+
+	return zone_id << zmd->dev->zone_nr_sectors_shift;
+}
+
+sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	/* Widen the zone ID before shifting so the result is a full sector_t. */
+	sector_t zone_id = dmz_id(zmd, zone);
+
+	return zone_id << zmd->dev->zone_nr_blocks_shift;
+}
+
+unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
+{
+ /* Number of entries in the chunk mapping table, per the super block. */
+ return zmd->nr_chunks;
+}
+
+unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd)
+{
+ /* Returns the nr_rnd counter (not nr_rnd_zones). */
+ return zmd->nr_rnd;
+}
+
+unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd)
+{
+ /* Snapshot of the atomic counter; may change right after the read. */
+ return atomic_read(&zmd->unmap_nr_rnd);
+}
+
+/*
+ * Lock/unlock mapping table.
+ * The map lock also protects all the zone lists.
+ */
+void dmz_lock_map(struct dmz_metadata *zmd)
+{
+ /* Sleeping lock: must not be called from atomic context. */
+ mutex_lock(&zmd->map_lock);
+}
+
+void dmz_unlock_map(struct dmz_metadata *zmd)
+{
+ /* Releases the mutex taken by dmz_lock_map(). */
+ mutex_unlock(&zmd->map_lock);
+}
+
+/*
+ * Lock/unlock metadata access. This is a "read" lock on a semaphore
+ * that prevents metadata flush from running while metadata are being
+ * modified. The actual metadata write mutual exclusion is achieved with
+ * the map lock and zone state management (active and reclaim state are
+ * mutually exclusive).
+ */
+void dmz_lock_metadata(struct dmz_metadata *zmd)
+{
+ /* Shared (read) side; dmz_flush_metadata() takes the write side. */
+ down_read(&zmd->mblk_sem);
+}
+
+void dmz_unlock_metadata(struct dmz_metadata *zmd)
+{
+ /* Drops the read hold taken by dmz_lock_metadata(). */
+ up_read(&zmd->mblk_sem);
+}
+
+/*
+ * Lock/unlock flush: prevent concurrent executions
+ * of dmz_flush_metadata as well as metadata modification in reclaim
+ * while flush is being executed.
+ */
+void dmz_lock_flush(struct dmz_metadata *zmd)
+{
+ /* Also taken internally by dmz_flush_metadata(). */
+ mutex_lock(&zmd->mblk_flush_lock);
+}
+
+void dmz_unlock_flush(struct dmz_metadata *zmd)
+{
+ /* Releases the mutex taken by dmz_lock_flush(). */
+ mutex_unlock(&zmd->mblk_flush_lock);
+}
+
+/*
+ * Allocate a metadata block.
+ */
+static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd,
+					   sector_t mblk_no)
+{
+	struct dmz_mblock *blk;
+
+	/*
+	 * Once the cache has grown past its configured maximum, try to
+	 * recycle the first descriptor of the LRU list instead of
+	 * allocating a new one. The recycled descriptor is unlinked from
+	 * both the LRU list and the rbtree before being handed out.
+	 */
+	if (zmd->max_nr_mblks && atomic_read(&zmd->nr_mblks) > zmd->max_nr_mblks) {
+		spin_lock(&zmd->mblk_lock);
+		blk = list_first_entry_or_null(&zmd->mblk_lru_list,
+					       struct dmz_mblock, link);
+		if (blk) {
+			list_del_init(&blk->link);
+			rb_erase(&blk->node, &zmd->mblk_rbtree);
+			blk->no = mblk_no;
+		}
+		spin_unlock(&zmd->mblk_lock);
+		if (blk)
+			return blk;
+	}
+
+	/* Nothing to recycle: build a fresh descriptor and its data page */
+	blk = kmalloc(sizeof(struct dmz_mblock), GFP_NOIO);
+	if (!blk)
+		return NULL;
+
+	blk->page = alloc_page(GFP_NOIO);
+	if (!blk->page)
+		goto free_desc;
+
+	blk->data = page_address(blk->page);
+	blk->no = mblk_no;
+	blk->state = 0;
+	atomic_set(&blk->ref, 0);
+	INIT_LIST_HEAD(&blk->link);
+	RB_CLEAR_NODE(&blk->node);
+
+	/* Account the new descriptor in the cache size */
+	atomic_inc(&zmd->nr_mblks);
+
+	return blk;
+
+free_desc:
+	kfree(blk);
+	return NULL;
+}
+
+/*
+ * Free a metadata block.
+ */
+static void dmz_free_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
+{
+	/*
+	 * The caller must already have unlinked the descriptor from the
+	 * rbtree and from any list: nothing is unlinked here.
+	 */
+	__free_page(mblk->page);
+	kfree(mblk);
+
+	/* One less descriptor accounted in the cache */
+	atomic_dec(&zmd->nr_mblks);
+}
+
+/*
+ * Insert a metadata block in the rbtree.
+ */
+static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
+{
+	struct rb_node **link = &zmd->mblk_rbtree.rb_node;
+	struct rb_node *parent = NULL;
+
+	/*
+	 * Walk down to the leaf position. Note the ordering: a node with a
+	 * smaller block number sends the walk to its left child, which is
+	 * the same rule dmz_lookup_mblock() uses.
+	 */
+	while (*link) {
+		struct dmz_mblock *cur;
+
+		parent = *link;
+		cur = container_of(parent, struct dmz_mblock, node);
+		if (cur->no < mblk->no)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	/* Hook the node in and let the rbtree code rebalance */
+	rb_link_node(&mblk->node, parent, link);
+	rb_insert_color(&mblk->node, &zmd->mblk_rbtree);
+}
+
+/*
+ * Lookup a metadata block in the rbtree.
+ */
+static struct dmz_mblock *dmz_lookup_mblock(struct dmz_metadata *zmd,
+					    sector_t mblk_no)
+{
+	struct rb_node *cursor = zmd->mblk_rbtree.rb_node;
+
+	/* Same descent rule as dmz_insert_mblock(): smaller goes left */
+	while (cursor) {
+		struct dmz_mblock *cur;
+
+		cur = container_of(cursor, struct dmz_mblock, node);
+		if (cur->no == mblk_no)
+			return cur;
+
+		if (cur->no < mblk_no)
+			cursor = cursor->rb_left;
+		else
+			cursor = cursor->rb_right;
+	}
+
+	/* Not cached */
+	return NULL;
+}
+
+/*
+ * Metadata block BIO end callback.
+ */
+static void dmz_mblock_bio_end_io(struct bio *bio)
+{
+	struct dmz_mblock *mblk = bio->bi_private;
+	/* The in-flight bit to drop depends on the direction of the BIO */
+	int flag = (bio_op(bio) == REQ_OP_WRITE) ?
+		DMZ_META_WRITING : DMZ_META_READING;
+
+	/* Record the failure before waking up anyone waiting on the bit */
+	if (bio->bi_status)
+		set_bit(DMZ_META_ERROR, &mblk->state);
+
+	clear_bit_unlock(flag, &mblk->state);
+	smp_mb__after_atomic();
+	wake_up_bit(&mblk->state, flag);
+
+	bio_put(bio);
+}
+
+/*
+ * Read a metadata block from disk.
+ */
+static struct dmz_mblock *dmz_fetch_mblock(struct dmz_metadata *zmd,
+					   sector_t mblk_no)
+{
+	struct dmz_mblock *mblk, *cached;
+	sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
+	struct bio *bio;
+
+	/* Get a block descriptor */
+	mblk = dmz_alloc_mblock(zmd, mblk_no);
+	if (!mblk)
+		return NULL;
+
+	/*
+	 * Allocate the BIO before the descriptor becomes visible in the
+	 * rbtree: on failure the descriptor can then be freed safely,
+	 * without leaving a dangling node in the tree.
+	 */
+	bio = bio_alloc(GFP_NOIO, 1);
+	if (!bio) {
+		dmz_free_mblock(zmd, mblk);
+		return NULL;
+	}
+
+	spin_lock(&zmd->mblk_lock);
+
+	/*
+	 * Another context may have started reading the same block since
+	 * the caller's lookup missed. If so, take a reference on that
+	 * descriptor (removing it from the LRU list if it was unused and
+	 * clean) and drop ours, so that the block is never inserted twice.
+	 */
+	cached = dmz_lookup_mblock(zmd, mblk_no);
+	if (cached) {
+		if (atomic_inc_return(&cached->ref) == 1 &&
+		    !test_bit(DMZ_META_DIRTY, &cached->state))
+			list_del_init(&cached->link);
+		spin_unlock(&zmd->mblk_lock);
+		dmz_free_mblock(zmd, mblk);
+		bio_put(bio);
+		return cached;
+	}
+
+	/* Publish the block in the READING state with one reference */
+	atomic_inc(&mblk->ref);
+	set_bit(DMZ_META_READING, &mblk->state);
+	dmz_insert_mblock(zmd, mblk);
+
+	spin_unlock(&zmd->mblk_lock);
+
+	/* Submit the read; completion clears DMZ_META_READING */
+	bio->bi_iter.bi_sector = dmz_blk2sect(block);
+	bio->bi_bdev = zmd->dev->bdev;
+	bio->bi_private = mblk;
+	bio->bi_end_io = dmz_mblock_bio_end_io;
+	bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO);
+	bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
+	submit_bio(bio);
+
+	return mblk;
+}
+
+/*
+ * Free metadata blocks.
+ */
+static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd,
+					     unsigned long limit)
+{
+	unsigned long freed = 0;
+
+	/* No maximum configured: the cache is never shrunk */
+	if (!zmd->max_nr_mblks)
+		return 0;
+
+	/*
+	 * Free blocks from the head of the LRU list until the list is
+	 * empty, the cache is down to its minimum size, or the caller's
+	 * limit is reached.
+	 */
+	for (;;) {
+		struct dmz_mblock *victim;
+
+		if (list_empty(&zmd->mblk_lru_list))
+			break;
+		if (atomic_read(&zmd->nr_mblks) <= zmd->min_nr_mblks)
+			break;
+		if (freed >= limit)
+			break;
+
+		victim = list_first_entry(&zmd->mblk_lru_list,
+					  struct dmz_mblock, link);
+		list_del_init(&victim->link);
+		rb_erase(&victim->node, &zmd->mblk_rbtree);
+		dmz_free_mblock(zmd, victim);
+		freed++;
+	}
+
+	return freed;
+}
+
+/*
+ * For mblock shrinker: get the number of unused metadata blocks in the cache.
+ */
+static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
+
+ /*
+ * NOTE(review): this reports every allocated descriptor (nr_mblks),
+ * not only the unused ones sitting on the LRU list - confirm that
+ * over-reporting is intended.
+ */
+ return atomic_read(&zmd->nr_mblks);
+}
+
+/*
+ * For mblock shrinker: scan unused metadata blocks and shrink the cache.
+ */
+static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink,
+					      struct shrink_control *sc)
+{
+	struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
+	unsigned long freed;
+
+	/* The LRU list and rbtree are only touched under mblk_lock */
+	spin_lock(&zmd->mblk_lock);
+	freed = dmz_shrink_mblock_cache(zmd, sc->nr_to_scan);
+	spin_unlock(&zmd->mblk_lock);
+
+	/* Nothing could be freed: tell the caller to stop scanning */
+	if (!freed)
+		return SHRINK_STOP;
+
+	return freed;
+}
+
+/*
+ * Release a metadata block.
+ */
+static void dmz_release_mblock(struct dmz_metadata *zmd,
+			       struct dmz_mblock *mblk)
+{
+	/* Releasing a NULL block is a no-op */
+	if (!mblk)
+		return;
+
+	spin_lock(&zmd->mblk_lock);
+
+	/* Other users still hold the block: nothing more to do */
+	if (!atomic_dec_and_test(&mblk->ref))
+		goto unlock;
+
+	if (test_bit(DMZ_META_ERROR, &mblk->state)) {
+		/* Do not keep a block whose I/O failed in the cache */
+		rb_erase(&mblk->node, &zmd->mblk_rbtree);
+		dmz_free_mblock(zmd, mblk);
+		goto unlock;
+	}
+
+	if (!test_bit(DMZ_META_DIRTY, &mblk->state)) {
+		/* Clean and unused: park it on the LRU list, trim the cache */
+		list_add_tail(&mblk->link, &zmd->mblk_lru_list);
+		dmz_shrink_mblock_cache(zmd, 1);
+	}
+
+unlock:
+	spin_unlock(&zmd->mblk_lock);
+}
+
+/*
+ * Get a metadata block from the rbtree. If the block
+ * is not present, read it from disk.
+ */
+static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
+					 sector_t mblk_no)
+{
+	struct dmz_mblock *blk;
+
+	/*
+	 * Try the cache first. On a hit, take a reference; a clean block
+	 * going from unused to used is sitting on the LRU list and must
+	 * be taken off it.
+	 */
+	spin_lock(&zmd->mblk_lock);
+	blk = dmz_lookup_mblock(zmd, mblk_no);
+	if (blk && atomic_inc_return(&blk->ref) == 1 &&
+	    !test_bit(DMZ_META_DIRTY, &blk->state))
+		list_del_init(&blk->link);
+	spin_unlock(&zmd->mblk_lock);
+
+	/* Cache miss: start reading the block from disk */
+	if (!blk) {
+		blk = dmz_fetch_mblock(zmd, mblk_no);
+		if (!blk)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	/* Sleep until any in-flight read completes */
+	wait_on_bit_io(&blk->state, DMZ_META_READING,
+		       TASK_UNINTERRUPTIBLE);
+
+	/* A failed read costs us our reference and yields -EIO */
+	if (test_bit(DMZ_META_ERROR, &blk->state)) {
+		dmz_release_mblock(zmd, blk);
+		return ERR_PTR(-EIO);
+	}
+
+	return blk;
+}
+
+/*
+ * Mark a metadata block dirty.
+ */
+static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
+{
+	bool already_dirty;
+
+	spin_lock(&zmd->mblk_lock);
+
+	/* Queue the block for flush only on the clean -> dirty transition */
+	already_dirty = test_and_set_bit(DMZ_META_DIRTY, &mblk->state);
+	if (!already_dirty)
+		list_add_tail(&mblk->link, &zmd->mblk_dirty_list);
+
+	spin_unlock(&zmd->mblk_lock);
+}
+
+/*
+ * Issue a metadata block write BIO.
+ */
+static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
+			     unsigned int set)
+{
+	struct bio *wbio;
+	sector_t target;
+
+	/* Location of the block within the chosen metadata set */
+	target = zmd->sb[set].block + mblk->no;
+
+	wbio = bio_alloc(GFP_NOIO, 1);
+	if (!wbio) {
+		/*
+		 * No BIO: flag the error. DMZ_META_WRITING is left clear,
+		 * so a waiter on that bit does not block on this mblk.
+		 */
+		set_bit(DMZ_META_ERROR, &mblk->state);
+		return;
+	}
+
+	/* Mark the write in flight; the end_io callback clears the bit */
+	set_bit(DMZ_META_WRITING, &mblk->state);
+
+	wbio->bi_iter.bi_sector = dmz_blk2sect(target);
+	wbio->bi_bdev = zmd->dev->bdev;
+	wbio->bi_private = mblk;
+	wbio->bi_end_io = dmz_mblock_bio_end_io;
+	bio_set_op_attrs(wbio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
+	bio_add_page(wbio, mblk->page, DMZ_BLOCK_SIZE, 0);
+	submit_bio(wbio);
+}
+
+/*
+ * Read/write a metadata block.
+ */
+static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
+			  struct page *page)
+{
+	struct bio *sync_bio = bio_alloc(GFP_NOIO, 1);
+	int err;
+
+	if (!sync_bio)
+		return -ENOMEM;
+
+	/* One page, one block, transferred synchronously */
+	sync_bio->bi_iter.bi_sector = dmz_blk2sect(block);
+	sync_bio->bi_bdev = zmd->dev->bdev;
+	bio_set_op_attrs(sync_bio, op, REQ_SYNC | REQ_META | REQ_PRIO);
+	bio_add_page(sync_bio, page, DMZ_BLOCK_SIZE, 0);
+
+	/* Blocks until the I/O completes and returns its status */
+	err = submit_bio_wait(sync_bio);
+	bio_put(sync_bio);
+
+	return err;
+}
+
+/*
+ * Write super block of the specified metadata set.
+ */
+static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
+{
+	sector_t block = zmd->sb[set].block;
+	struct dmz_mblock *mblk = zmd->sb[set].mblk;
+	struct dmz_super *sb = zmd->sb[set].sb;
+	/* The super block written carries the next generation number */
+	u64 sb_gen = zmd->sb_gen + 1;
+	int ret;
+
+	sb->magic = cpu_to_le32(DMZ_MAGIC);
+	sb->version = cpu_to_le32(DMZ_META_VER);
+
+	sb->gen = cpu_to_le64(sb_gen);
+
+	sb->sb_block = cpu_to_le64(block);
+	sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks);
+	sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq);
+	sb->nr_chunks = cpu_to_le32(zmd->nr_chunks);
+
+	sb->nr_map_blocks = cpu_to_le32(zmd->nr_map_blocks);
+	sb->nr_bitmap_blocks = cpu_to_le32(zmd->nr_bitmap_blocks);
+
+	/*
+	 * The checksum covers the whole block with the crc field zeroed
+	 * and is seeded with the generation number; dmz_check_sb()
+	 * recomputes it the same way.
+	 */
+	sb->crc = 0;
+	sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE));
+
+	ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page);
+	if (ret == 0) {
+		/*
+		 * This runs on the metadata write-out path: use GFP_NOIO,
+		 * like the BIO allocations, so that the flush allocation
+		 * cannot recurse into I/O through memory reclaim.
+		 */
+		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+	}
+
+	return ret;
+}
+
+/*
+ * Write dirty metadata blocks to the specified set.
+ */
+static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
+				   struct list_head *write_list,
+				   unsigned int set)
+{
+	struct dmz_mblock *mblk;
+	struct blk_plug plug;
+	int ret = 0;
+
+	/* Issue all the writes under a single plug */
+	blk_start_plug(&plug);
+	list_for_each_entry(mblk, write_list, link)
+		dmz_write_mblock(zmd, mblk, set);
+	blk_finish_plug(&plug);
+
+	/*
+	 * Wait for every write to complete. An error on any block fails
+	 * the whole batch; the error bit is cleared so that the block
+	 * can be written again by a later flush.
+	 */
+	list_for_each_entry(mblk, write_list, link) {
+		wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
+			       TASK_UNINTERRUPTIBLE);
+		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
+			clear_bit(DMZ_META_ERROR, &mblk->state);
+			ret = -EIO;
+		}
+	}
+
+	/*
+	 * Flush drive cache (this will also sync data). GFP_NOIO is used
+	 * because this is the metadata write-out path: the allocation
+	 * must not recurse into I/O through memory reclaim.
+	 */
+	if (ret == 0)
+		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+
+	return ret;
+}
+
+/*
+ * Log dirty metadata blocks.
+ */
+static int dmz_log_dirty_mblocks(struct dmz_metadata *zmd,
+				 struct list_head *write_list)
+{
+	/* The non-primary metadata set serves as the log */
+	unsigned int log_set = zmd->mblk_primary ^ 0x1;
+	int err;
+
+	/* Step 1: put the dirty blocks in the log set */
+	err = dmz_write_dirty_mblocks(zmd, write_list, log_set);
+	if (err)
+		return err;
+
+	/*
+	 * Step 2: the blocks are on disk, so validate the log by writing
+	 * its super block with the new generation number.
+	 */
+	return dmz_write_sb(zmd, log_set);
+}
+
+/*
+ * Flush dirty metadata blocks.
+ */
+int dmz_flush_metadata(struct dmz_metadata *zmd)
+{
+	struct dmz_mblock *mblk;
+	struct list_head write_list;
+	int ret;
+
+	if (WARN_ON(!zmd))
+		return 0;
+
+	INIT_LIST_HEAD(&write_list);
+
+	/*
+	 * Make sure that metadata blocks are stable before logging: take
+	 * the write lock on the metadata semaphore to prevent target BIOs
+	 * from modifying metadata.
+	 */
+	down_write(&zmd->mblk_sem);
+
+	/*
+	 * This is called from the target flush work and reclaim work.
+	 * Concurrent execution is not allowed.
+	 */
+	dmz_lock_flush(zmd);
+
+	/* Move all dirty blocks to a private list */
+	spin_lock(&zmd->mblk_lock);
+	list_splice_init(&zmd->mblk_dirty_list, &write_list);
+	spin_unlock(&zmd->mblk_lock);
+
+	/*
+	 * If there are no dirty metadata blocks, just flush the device
+	 * cache. GFP_NOIO: the metadata locks are held here, so the
+	 * allocation must not recurse into I/O through memory reclaim.
+	 */
+	if (list_empty(&write_list)) {
+		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+		goto out;
+	}
+
+	/*
+	 * The primary metadata set is still clean. Keep it this way until
+	 * all updates are successful in the secondary set. That is, use
+	 * the secondary set as a log.
+	 */
+	ret = dmz_log_dirty_mblocks(zmd, &write_list);
+	if (ret)
+		goto out;
+
+	/*
+	 * The log is on disk. It is now safe to update in place
+	 * in the primary metadata set.
+	 */
+	ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary);
+	if (ret)
+		goto out;
+
+	ret = dmz_write_sb(zmd, zmd->mblk_primary);
+	if (ret)
+		goto out;
+
+	/*
+	 * Everything is on disk: mark the blocks clean and hand the
+	 * unreferenced ones back to the LRU list.
+	 */
+	while (!list_empty(&write_list)) {
+		mblk = list_first_entry(&write_list, struct dmz_mblock, link);
+		list_del_init(&mblk->link);
+
+		spin_lock(&zmd->mblk_lock);
+		clear_bit(DMZ_META_DIRTY, &mblk->state);
+		if (atomic_read(&mblk->ref) == 0)
+			list_add_tail(&mblk->link, &zmd->mblk_lru_list);
+		spin_unlock(&zmd->mblk_lock);
+	}
+
+	/* Both super blocks now carry the new generation */
+	zmd->sb_gen++;
+out:
+	/* On failure, requeue the blocks so that a later flush retries */
+	if (ret && !list_empty(&write_list)) {
+		spin_lock(&zmd->mblk_lock);
+		list_splice(&write_list, &zmd->mblk_dirty_list);
+		spin_unlock(&zmd->mblk_lock);
+	}
+
+	dmz_unlock_flush(zmd);
+	up_write(&zmd->mblk_sem);
+
+	return ret;
+}
+
+/*
+ * Check super block.
+ */
+static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb)
+{
+	struct dmz_dev *dev = zmd->dev;
+	unsigned int nr_meta_zones, nr_data_zones;
+	u32 meta_blocks, reserved_seq, chunks;
+	u32 computed_crc, disk_crc;
+	u64 gen;
+
+	/*
+	 * Recompute the checksum the way dmz_write_sb() produced it: over
+	 * the whole block, crc field zeroed, seeded with the generation.
+	 * Note that sb->crc is left zeroed afterwards.
+	 */
+	gen = le64_to_cpu(sb->gen);
+	disk_crc = le32_to_cpu(sb->crc);
+	sb->crc = 0;
+	computed_crc = crc32_le(gen, (unsigned char *)sb, DMZ_BLOCK_SIZE);
+	if (computed_crc != disk_crc) {
+		dmz_dev_err(dev, "Invalid checksum (needed 0x%08x, got 0x%08x)",
+			    computed_crc, disk_crc);
+		return -ENXIO;
+	}
+
+	if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
+		dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
+			    DMZ_MAGIC, le32_to_cpu(sb->magic));
+		return -ENXIO;
+	}
+
+	if (le32_to_cpu(sb->version) != DMZ_META_VER) {
+		dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
+			    DMZ_META_VER, le32_to_cpu(sb->version));
+		return -ENXIO;
+	}
+
+	/* Decode the geometry fields once */
+	meta_blocks = le32_to_cpu(sb->nr_meta_blocks);
+	reserved_seq = le32_to_cpu(sb->nr_reserved_seq);
+	chunks = le32_to_cpu(sb->nr_chunks);
+
+	/* Number of zones needed for one metadata set, rounded up */
+	nr_meta_zones = (meta_blocks + dev->zone_nr_blocks - 1)
+		>> dev->zone_nr_blocks_shift;
+	if (!nr_meta_zones ||
+	    nr_meta_zones >= zmd->nr_rnd_zones) {
+		dmz_dev_err(dev, "Invalid number of metadata blocks");
+		return -ENXIO;
+	}
+
+	if (!reserved_seq ||
+	    reserved_seq >= (zmd->nr_useable_zones - nr_meta_zones)) {
+		dmz_dev_err(dev, "Invalid number of reserved sequential zones");
+		return -ENXIO;
+	}
+
+	/*
+	 * Data zones are what remains after the two metadata sets and the
+	 * reserved sequential zones.
+	 * NOTE(review): the checks above do not obviously guarantee that
+	 * nr_meta_zones * 2 + reserved_seq <= nr_useable_zones, so this
+	 * unsigned subtraction looks like it could wrap - confirm.
+	 */
+	nr_data_zones = zmd->nr_useable_zones -
+		(nr_meta_zones * 2 + reserved_seq);
+	if (chunks > nr_data_zones) {
+		dmz_dev_err(dev, "Invalid number of chunks %u / %u",
+			    chunks, nr_data_zones);
+		return -ENXIO;
+	}
+
+	/* The super block is valid: adopt its geometry */
+	zmd->nr_meta_blocks = meta_blocks;
+	zmd->nr_reserved_seq = reserved_seq;
+	zmd->nr_chunks = chunks;
+	zmd->nr_map_blocks = le32_to_cpu(sb->nr_map_blocks);
+	zmd->nr_bitmap_blocks = le32_to_cpu(sb->nr_bitmap_blocks);
+	zmd->nr_meta_zones = nr_meta_zones;
+	zmd->nr_data_zones = nr_data_zones;
+
+	return 0;
+}
+
+/*
+ * Read the first or second super block from disk.
+ */
+static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set)
+{
+ /*
+ * Synchronous read of the block at sb[set].block into the page of
+ * sb[set].mblk; both must have been set up by the caller.
+ */
+ return dmz_rdwr_block(zmd, REQ_OP_READ, zmd->sb[set].block,
+ zmd->sb[set].mblk->page);
+}
+
+/*
+ * Determine the position of the secondary super blocks on disk.
+ * This is used only if a corruption of the primary super block
+ * is detected.
+ */
+static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd)
+{
+	unsigned int stride = zmd->dev->zone_nr_blocks;
+	struct dmz_mblock *blk;
+	int i;
+
+	/* Buffer used to read each candidate super block */
+	blk = dmz_alloc_mblock(zmd, 0);
+	if (!blk)
+		return -ENOMEM;
+
+	zmd->sb[1].mblk = blk;
+	zmd->sb[1].sb = blk->data;
+
+	/*
+	 * Probe the first block of each zone following the one holding
+	 * the primary super block, looking for the magic number. A read
+	 * error ends the search.
+	 */
+	zmd->sb[1].block = zmd->sb[0].block + stride;
+	for (i = 0; i < zmd->nr_rnd_zones - 1; i++) {
+		if (dmz_read_sb(zmd, 1))
+			break;
+		if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC)
+			return 0;
+		zmd->sb[1].block += stride;
+	}
+
+	/* Not found: release the probe buffer */
+	dmz_free_mblock(zmd, blk);
+	zmd->sb[1].mblk = NULL;
+
+	return -EIO;
+}
+
+/*
+ * Allocate a metadata block for super block @set, attach it to
+ * zmd->sb[set] and fill it by reading the super block from disk.
+ * Returns 0 on success, -ENOMEM if the block cannot be allocated, or the
+ * read error, in which case the block is freed again.
+ */
+static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set)
+{
+ struct dmz_mblock *sb_mblk = dmz_alloc_mblock(zmd, 0);
+ int err;
+
+ if (!sb_mblk)
+ return -ENOMEM;
+
+ /* Attach the block first: dmz_read_sb() reads into it */
+ zmd->sb[set].mblk = sb_mblk;
+ zmd->sb[set].sb = sb_mblk->data;
+
+ err = dmz_read_sb(zmd, set);
+ if (!err)
+ return 0;
+
+ /* Read failed: drop the block */
+ dmz_free_mblock(zmd, sb_mblk);
+ zmd->sb[set].mblk = NULL;
+
+ return err;
+}
+
+/*
+ * Recover a metadata set.
+ *
+ * Rebuild metadata set @dst_set from the other set (dst_set ^ 1): all
+ * metadata blocks except block 0 are copied one by one through a
+ * temporary page, then the super block of @dst_set is written last with
+ * dmz_write_sb().
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error of
+ * the failing block read/write.
+ */
+static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set)
+{
+ unsigned int src_set = dst_set ^ 0x1;
+ struct page *page;
+ int i, ret;
+
+ dmz_dev_warn(zmd->dev, "Metadata set %u invalid: recovering", dst_set);
+
+ /* (Re)compute the location of the destination set super block */
+ if (dst_set == 0)
+ zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone);
+ else {
+ zmd->sb[1].block = zmd->sb[0].block +
+ (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
+ }
+
+ /* Bounce page used to copy blocks from one set to the other */
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ /* Copy metadata blocks (block 0, the super block, is written last) */
+ for (i = 1; i < zmd->nr_meta_blocks; i++) {
+ ret = dmz_rdwr_block(zmd, REQ_OP_READ,
+ zmd->sb[src_set].block + i, page);
+ if (ret)
+ goto out;
+ ret = dmz_rdwr_block(zmd, REQ_OP_WRITE,
+ zmd->sb[dst_set].block + i, page);
+ if (ret)
+ goto out;
+ }
+
+ /* Finalize with the super block */
+ if (!zmd->sb[dst_set].mblk) {
+ zmd->sb[dst_set].mblk = dmz_alloc_mblock(zmd, 0);
+ if (!zmd->sb[dst_set].mblk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ zmd->sb[dst_set].sb = zmd->sb[dst_set].mblk->data;
+ }
+
+ ret = dmz_write_sb(zmd, dst_set);
+out:
+ __free_pages(page, 0);
+
+ return ret;
+}
+
+/*
+ * Get super block from disk.
+ *
+ * Read and check both super blocks. If only one of them is valid, the
+ * invalid metadata set is recovered from the valid one. The set with the
+ * highest generation (set 0 on a tie) is then selected as the primary
+ * set in zmd->mblk_primary, and its generation is saved in zmd->sb_gen.
+ *
+ * Returns 0 on success, the read error if a super block cannot be read,
+ * or -EIO if no valid super block exists or if recovery failed.
+ */
+static int dmz_load_sb(struct dmz_metadata *zmd)
+{
+ bool sb_good[2] = {false, false};
+ u64 sb_gen[2] = {0, 0};
+ int ret;
+
+ /* Read and check the primary super block */
+ zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone);
+ ret = dmz_get_sb(zmd, 0);
+ if (ret) {
+ dmz_dev_err(zmd->dev, "Read primary super block failed");
+ return ret;
+ }
+
+ ret = dmz_check_sb(zmd, zmd->sb[0].sb);
+
+ /*
+ * Read and check secondary super block: its location is computed
+ * from the primary super block if that one is valid, and searched
+ * for on disk otherwise.
+ */
+ if (ret == 0) {
+ sb_good[0] = true;
+ zmd->sb[1].block = zmd->sb[0].block +
+ (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
+ ret = dmz_get_sb(zmd, 1);
+ } else
+ ret = dmz_lookup_secondary_sb(zmd);
+
+ if (ret) {
+ dmz_dev_err(zmd->dev, "Read secondary super block failed");
+ return ret;
+ }
+
+ ret = dmz_check_sb(zmd, zmd->sb[1].sb);
+ if (ret == 0)
+ sb_good[1] = true;
+
+ /* Use highest generation sb first */
+ if (!sb_good[0] && !sb_good[1]) {
+ dmz_dev_err(zmd->dev, "No valid super block found");
+ return -EIO;
+ }
+
+ /*
+ * At least one set is valid here, so at most one of the two
+ * recovery calls below is executed. A recovered set keeps a
+ * generation of 0 and is thus never preferred over the valid set.
+ */
+ if (sb_good[0])
+ sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen);
+ else
+ ret = dmz_recover_mblocks(zmd, 0);
+
+ if (sb_good[1])
+ sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen);
+ else
+ ret = dmz_recover_mblocks(zmd, 1);
+
+ if (ret) {
+ dmz_dev_err(zmd->dev, "Recovery failed");
+ return -EIO;
+ }
+
+ if (sb_gen[0] >= sb_gen[1]) {
+ zmd->sb_gen = sb_gen[0];
+ zmd->mblk_primary = 0;
+ } else {
+ zmd->sb_gen = sb_gen[1];
+ zmd->mblk_primary = 1;
+ }
+
+ dmz_dev_debug(zmd->dev, "Using super block %u (gen %llu)",
+ zmd->mblk_primary, zmd->sb_gen);
+
+ return 0;
+}
+
+/*
+ * Initialize a zone descriptor from a zone report entry.
+ *
+ * Set the zone type (random or sequential) and condition (offline or
+ * read-only) flags and the zone write pointer position, and account the
+ * zone in the useable and random zone counters. The first useable random
+ * zone found becomes the super block zone.
+ *
+ * Returns 0 on success (including for an ignored trailing runt zone) and
+ * -ENXIO for a zone of unexpected size or of unknown type.
+ */
+static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone,
+ struct blk_zone *blkz)
+{
+ struct dmz_dev *dev = zmd->dev;
+
+ /* Ignore the eventual last runt (smaller) zone */
+ if (blkz->len != dev->zone_nr_sectors) {
+ if (blkz->start + blkz->len == dev->capacity)
+ return 0;
+ return -ENXIO;
+ }
+
+ INIT_LIST_HEAD(&zone->link);
+ atomic_set(&zone->refcount, 0);
+ zone->chunk = DMZ_MAP_UNMAPPED;
+
+ if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ /*
+ * Do not count the zone in nr_rnd_zones here: it is counted
+ * exactly once below, and only if it is useable.
+ */
+ set_bit(DMZ_RND, &zone->flags);
+ } else if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_REQ ||
+ blkz->type == BLK_ZONE_TYPE_SEQWRITE_PREF) {
+ set_bit(DMZ_SEQ, &zone->flags);
+ } else
+ return -ENXIO;
+
+ if (blkz->cond == BLK_ZONE_COND_OFFLINE)
+ set_bit(DMZ_OFFLINE, &zone->flags);
+ else if (blkz->cond == BLK_ZONE_COND_READONLY)
+ set_bit(DMZ_READ_ONLY, &zone->flags);
+
+ /* Random zones have no write pointer to track */
+ if (dmz_is_rnd(zone))
+ zone->wp_block = 0;
+ else
+ zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
+
+ /* Only zones that are neither offline nor read-only are useable */
+ if (!dmz_is_offline(zone) && !dmz_is_readonly(zone)) {
+ zmd->nr_useable_zones++;
+ if (dmz_is_rnd(zone)) {
+ zmd->nr_rnd_zones++;
+ if (!zmd->sb_zone) {
+ /* Super block zone */
+ zmd->sb_zone = zone;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Release the zone descriptor array and clear the stale pointer.
+ */
+static void dmz_drop_zones(struct dmz_metadata *zmd)
+{
+ struct dm_zone *zones = zmd->zones;
+
+ zmd->zones = NULL;
+ kfree(zones);
+}
+
+/*
+ * The size of a zone report in number of zones, that is, the number of
+ * struct blk_zone entries requested by each report zones command.
+ * This results in 4096*64B=256KB report zones commands
+ * (NOTE(review): assumes sizeof(struct blk_zone) is 64 B -- confirm).
+ */
+#define DMZ_REPORT_NR_ZONES 4096
+
+/*
+ * Allocate and initialize zone descriptors using the zone
+ * information from disk.
+ *
+ * The device is scanned from sector 0 with zone reports of up to
+ * DMZ_REPORT_NR_ZONES zones, each reported zone being processed with
+ * dmz_init_zone().
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -ENXIO if the
+ * zone information does not cover the device capacity, or the error
+ * returned by the zone report or by dmz_init_zone(). On error, the zone
+ * descriptor array is freed.
+ */
+static int dmz_init_zones(struct dmz_metadata *zmd)
+{
+ struct dmz_dev *dev = zmd->dev;
+ struct dm_zone *zone;
+ struct blk_zone *blkz;
+ unsigned int nr_blkz;
+ sector_t sector = 0;
+ int i, ret = 0;
+
+ /* Init */
+ zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
+ zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT;
+
+ /* Allocate zone array */
+ zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
+ if (!zmd->zones)
+ return -ENOMEM;
+
+ dmz_dev_info(dev, "Using %zu B for zone information",
+ sizeof(struct dm_zone) * dev->nr_zones);
+
+ /* Get zone information */
+ nr_blkz = DMZ_REPORT_NR_ZONES;
+ blkz = kcalloc(nr_blkz, sizeof(struct blk_zone), GFP_KERNEL);
+ if (!blkz) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Get zone information and initialize zone descriptors.
+ * At the same time, determine where the super block
+ * should be: first block of the first randomly writable
+ * zone.
+ */
+ zone = zmd->zones;
+ while (sector < dev->capacity) {
+ /* Get zone information */
+ nr_blkz = DMZ_REPORT_NR_ZONES;
+ ret = blkdev_report_zones(dev->bdev, sector, blkz,
+ &nr_blkz, GFP_KERNEL);
+ if (ret) {
+ dmz_dev_err(dev, "Report zones failed %d", ret);
+ goto out;
+ }
+
+ /*
+ * An empty report would never advance sector and loop
+ * forever: stop here and let the capacity check below
+ * fail the initialization.
+ */
+ if (!nr_blkz)
+ break;
+
+ /* Process report */
+ for (i = 0; i < nr_blkz; i++) {
+ ret = dmz_init_zone(zmd, zone, &blkz[i]);
+ if (ret)
+ goto out;
+ sector += dev->zone_nr_sectors;
+ zone++;
+ }
+ }
+
+ /* The entire zone configuration of the disk should now be known */
+ if (sector < dev->capacity) {
+ dmz_dev_err(dev, "Failed to get correct zone information");
+ ret = -ENXIO;
+ }
+out:
+ kfree(blkz);
+ if (ret)
+ dmz_drop_zones(zmd);
+
+ return ret;
+}
+
+/*
+ * Update a zone information.
+ *
+ * Get a report for @zone from the device and refresh the zone offline
+ * and read-only flags as well as its write pointer position (always 0
+ * for non-sequential zones).
+ *
+ * Returns 0 on success, the zone report error, or -EIO if the report
+ * succeeded but did not return any zone.
+ */
+static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+ unsigned int nr_blkz = 1;
+ struct blk_zone blkz;
+ int ret;
+
+ /* Get zone information from disk */
+ ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
+ &blkz, &nr_blkz, GFP_KERNEL);
+ /* Without a reported zone, blkz is left uninitialized: do not use it */
+ if (!ret && !nr_blkz)
+ ret = -EIO;
+ if (ret) {
+ dmz_dev_err(zmd->dev, "Get zone %u report failed",
+ dmz_id(zmd, zone));
+ return ret;
+ }
+
+ clear_bit(DMZ_OFFLINE, &zone->flags);
+ clear_bit(DMZ_READ_ONLY, &zone->flags);
+ if (blkz.cond == BLK_ZONE_COND_OFFLINE)
+ set_bit(DMZ_OFFLINE, &zone->flags);
+ else if (blkz.cond == BLK_ZONE_COND_READONLY)
+ set_bit(DMZ_READ_ONLY, &zone->flags);
+
+ if (dmz_is_seq(zone))
+ zone->wp_block = dmz_sect2blk(blkz.wp - blkz.start);
+ else
+ zone->wp_block = 0;
+
+ return 0;
+}
+
+/*
+ * Handle a zone flagged with a sequential write error: refresh the zone
+ * information from the device and, if the device write pointer is now
+ * behind the position that was cached, invalidate the blocks in between.
+ * Returns 0 on success or the zone update error.
+ */
+static int dmz_handle_seq_write_err(struct dmz_metadata *zmd,
+ struct dm_zone *zone)
+{
+ /* Remember the cached position: dmz_update_zone() overwrites it */
+ unsigned int cached_wp = zone->wp_block;
+ int err = dmz_update_zone(zmd, zone);
+
+ if (err)
+ return err;
+
+ dmz_dev_warn(zmd->dev, "Processing zone %u write error (zone wp %u/%u)",
+ dmz_id(zmd, zone), zone->wp_block, cached_wp);
+
+ if (zone->wp_block >= cached_wp)
+ return 0;
+
+ dmz_invalidate_blocks(zmd, zone, zone->wp_block,
+ cached_wp - zone->wp_block);
+
+ return 0;
+}
+
+/*
+ * Return the descriptor of the zone with ID @zone_id.
+ * The ID is not range-checked here.
+ */
+static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id)
+{
+ return zmd->zones + zone_id;
+}
+
+/*
+ * Reset the write pointer of a zone.
+ * Offline, read-only and random zones are left untouched. A zone reset
+ * is issued to the device only if the zone is not empty or is flagged
+ * with a sequential write error.
+ * Returns 0 on success or the error of the zone reset.
+ */
+static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+ struct dmz_dev *dev = zmd->dev;
+ int ret;
+
+ /* Nothing to do for zones that cannot or need not be reset */
+ if (dmz_is_offline(zone))
+ return 0;
+ if (dmz_is_readonly(zone))
+ return 0;
+ if (dmz_is_rnd(zone))
+ return 0;
+
+ /* An empty zone without write error is already at its start */
+ if (dmz_is_empty(zone) && !dmz_seq_write_err(zone))
+ goto rewind;
+
+ ret = blkdev_reset_zones(dev->bdev, dmz_start_sect(zmd, zone),
+ dev->zone_nr_sectors, GFP_KERNEL);
+ if (ret) {
+ dmz_dev_err(dev, "Reset zone %u failed %d",
+ dmz_id(zmd, zone), ret);
+ return ret;
+ }
+
+rewind:
+ /* Drop the write error state and rewind the cached write pointer */
+ clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
+ zone->wp_block = 0;
+
+ return 0;
+}
+
+static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone);
+
+/*
+ * Initialize chunk mapping.
+ *
+ * Load the chunk mapping table blocks and, for every mapped chunk,
+ * initialize its data zone and optional buffer zone (flags, chunk
+ * number, weight) and add them to the mapped zone lists. All remaining
+ * non-metadata zones are then set up as unmapped data zones and added
+ * to the unmapped random, reserved sequential or unmapped sequential
+ * lists.
+ *
+ * Returns 0 on success, -ENOMEM if the mapping block array cannot be
+ * allocated, the error of a failed mapping block read, or -EIO if the
+ * mapping table references an invalid zone.
+ */
+static int dmz_load_mapping(struct dmz_metadata *zmd)
+{
+ struct dmz_dev *dev = zmd->dev;
+ struct dm_zone *dzone, *bzone;
+ struct dmz_mblock *dmap_mblk = NULL;
+ struct dmz_map *dmap;
+ unsigned int i = 0, e = 0, chunk = 0;
+ unsigned int dzone_id;
+ unsigned int bzone_id;
+
+ /* Metadata block array for the chunk mapping table */
+ zmd->map_mblk = kcalloc(zmd->nr_map_blocks,
+ sizeof(struct dmz_mblock *), GFP_KERNEL);
+ if (!zmd->map_mblk)
+ return -ENOMEM;
+
+ /* Get chunk mapping table blocks and initialize zone mapping */
+ while (chunk < zmd->nr_chunks) {
+ if (!dmap_mblk) {
+ /* Get mapping block (block 0 is the super block) */
+ dmap_mblk = dmz_get_mblock(zmd, i + 1);
+ if (IS_ERR(dmap_mblk))
+ return PTR_ERR(dmap_mblk);
+ zmd->map_mblk[i] = dmap_mblk;
+ dmap = (struct dmz_map *) dmap_mblk->data;
+ i++;
+ e = 0;
+ }
+
+ /* Check data zone */
+ dzone_id = le32_to_cpu(dmap[e].dzone_id);
+ if (dzone_id == DMZ_MAP_UNMAPPED)
+ goto next;
+
+ if (dzone_id >= dev->nr_zones) {
+ dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u",
+ chunk, dzone_id);
+ return -EIO;
+ }
+
+ dzone = dmz_get(zmd, dzone_id);
+ set_bit(DMZ_DATA, &dzone->flags);
+ dzone->chunk = chunk;
+ dmz_get_zone_weight(zmd, dzone);
+
+ if (dmz_is_rnd(dzone))
+ list_add_tail(&dzone->link, &zmd->map_rnd_list);
+ else
+ list_add_tail(&dzone->link, &zmd->map_seq_list);
+
+ /* Check buffer zone */
+ bzone_id = le32_to_cpu(dmap[e].bzone_id);
+ if (bzone_id == DMZ_MAP_UNMAPPED)
+ goto next;
+
+ if (bzone_id >= dev->nr_zones) {
+ dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u",
+ chunk, bzone_id);
+ return -EIO;
+ }
+
+ /* A buffer zone must be a random zone */
+ bzone = dmz_get(zmd, bzone_id);
+ if (!dmz_is_rnd(bzone)) {
+ dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone %u",
+ chunk, bzone_id);
+ return -EIO;
+ }
+
+ set_bit(DMZ_DATA, &bzone->flags);
+ set_bit(DMZ_BUF, &bzone->flags);
+ bzone->chunk = chunk;
+ bzone->bzone = dzone;
+ dzone->bzone = bzone;
+ dmz_get_zone_weight(zmd, bzone);
+ list_add_tail(&bzone->link, &zmd->map_rnd_list);
+next:
+ chunk++;
+ e++;
+ /* End of this mapping block: load the next one */
+ if (e >= DMZ_MAP_ENTRIES)
+ dmap_mblk = NULL;
+ }
+
+ /*
+ * At this point, only meta zones and mapped data zones were
+ * fully initialized. All remaining zones are unmapped data
+ * zones. Finish initializing those here.
+ */
+ for (i = 0; i < dev->nr_zones; i++) {
+ dzone = dmz_get(zmd, i);
+ if (dmz_is_meta(dzone))
+ continue;
+
+ if (dmz_is_rnd(dzone))
+ zmd->nr_rnd++;
+ else
+ zmd->nr_seq++;
+
+ if (dmz_is_data(dzone)) {
+ /* Already initialized */
+ continue;
+ }
+
+ /* Unmapped data zone */
+ set_bit(DMZ_DATA, &dzone->flags);
+ dzone->chunk = DMZ_MAP_UNMAPPED;
+ if (dmz_is_rnd(dzone)) {
+ list_add_tail(&dzone->link, &zmd->unmap_rnd_list);
+ atomic_inc(&zmd->unmap_nr_rnd);
+ } else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) {
+ /* Reserved zones are not counted as sequential zones */
+ list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list);
+ atomic_inc(&zmd->nr_reserved_seq_zones);
+ zmd->nr_seq--;
+ } else {
+ list_add_tail(&dzone->link, &zmd->unmap_seq_list);
+ atomic_inc(&zmd->unmap_nr_seq);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Record the data zone ID and buffer zone ID mapping @chunk in the
+ * chunk mapping table and mark the modified mapping block dirty.
+ */
+static void dmz_set_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk,
+ unsigned int dzone_id, unsigned int bzone_id)
+{
+ struct dmz_mblock *mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
+ struct dmz_map *entry;
+
+ /* Locate the entry of the chunk within its mapping block */
+ entry = ((struct dmz_map *) mblk->data) + (chunk & DMZ_MAP_ENTRIES_MASK);
+
+ entry->dzone_id = cpu_to_le32(dzone_id);
+ entry->bzone_id = cpu_to_le32(bzone_id);
+ dmz_dirty_mblock(zmd, mblk);
+}
+
+/*
+ * The mapped zone lists are kept in LRU order: move @zone to the tail
+ * of the mapped list matching its type. A zone that is not on any list
+ * is left alone.
+ */
+static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+ struct list_head *lru;
+
+ if (list_empty(&zone->link))
+ return;
+
+ list_del_init(&zone->link);
+
+ /* LRU rotate on the sequential or on the random zone list */
+ lru = dmz_is_seq(zone) ? &zmd->map_seq_list : &zmd->map_rnd_list;
+ list_add_tail(&zone->link, lru);
+}
+
+/*
+ * LRU rotate a zone and, if it has one, the zone paired with it
+ * through its bzone pointer.
+ */
+static void dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+ struct dm_zone *paired;
+
+ __dmz_lru_zone(zmd, zone);
+
+ paired = zone->bzone;
+ if (!paired)
+ return;
+
+ __dmz_lru_zone(zmd, paired);
+}
+
+/*
+ * Wait for any zone to be freed.
+ *
+ * The caller must hold the metadata lock and the map lock: both are
+ * dropped while sleeping and taken again, in the reverse order, before
+ * returning. The sleep ends on a wake up of zmd->free_wq or after at
+ * most HZ jiffies, so the caller must check again for a free zone.
+ */
+static void dmz_wait_for_free_zones(struct dmz_metadata *zmd)
+{
+ DEFINE_WAIT(wait);
+
+ /* Queue on free_wq before dropping the locks */
+ prepare_to_wait(&zmd->free_wq, &wait, TASK_UNINTERRUPTIBLE);
+ dmz_unlock_map(zmd);
+ dmz_unlock_metadata(zmd);
+
+ io_schedule_timeout(HZ);
+
+ dmz_lock_metadata(zmd);
+ dmz_lock_map(zmd);
+ finish_wait(&zmd->free_wq, &wait);
+}
+
+/*
+ * Lock a zone for reclaim (set the zone RECLAIM bit).
+ * Returns 0 if the zone cannot be locked (it is active) or if it is
+ * already locked, and 1 if the zone was successfully locked.
+ */
+int dmz_lock_zone_reclaim(struct dm_zone *zone)
+{
+ int already_locked;
+
+ /* Active zones cannot be reclaimed */
+ if (dmz_is_active(zone))
+ return 0;
+
+ already_locked = test_and_set_bit(DMZ_RECLAIM, &zone->flags);
+
+ return !already_locked;
+}
+
+/*
+ * Clear a zone reclaim flag.
+ *
+ * Warns if the zone is active or is not locked for reclaim. Waiters
+ * sleeping on the zone DMZ_RECLAIM bit are woken up.
+ */
+void dmz_unlock_zone_reclaim(struct dm_zone *zone)
+{
+ WARN_ON(dmz_is_active(zone));
+ WARN_ON(!dmz_in_reclaim(zone));
+
+ clear_bit_unlock(DMZ_RECLAIM, &zone->flags);
+ /* Order the bit clearing before the wake up */
+ smp_mb__after_atomic();
+ wake_up_bit(&zone->flags, DMZ_RECLAIM);
+}
+
+/*
+ * Wait for a zone reclaim to complete.
+ *
+ * The caller must hold the metadata lock and the map lock: both are
+ * dropped while waiting and taken again before returning. The wait is
+ * bounded to HZ jiffies, so the zone may still be in reclaim on return
+ * and the caller must check again.
+ */
+static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+ dmz_unlock_map(zmd);
+ dmz_unlock_metadata(zmd);
+ wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ);
+ dmz_lock_metadata(zmd);
+ dmz_lock_map(zmd);
+}
+
+/*
+ * Select a random write zone for reclaim.
+ * Walk the mapped random zone list from its head and return the first
+ * candidate that can be locked for reclaim, or NULL if there is none.
+ * For a buffer zone, the candidate is the data zone it buffers.
+ */
+static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
+{
+ struct dm_zone *candidate;
+ struct dm_zone *zone;
+
+ /* An empty list simply results in no iteration */
+ list_for_each_entry(zone, &zmd->map_rnd_list, link) {
+ candidate = dmz_is_buf(zone) ? zone->bzone : zone;
+ if (dmz_lock_zone_reclaim(candidate))
+ return candidate;
+ }
+
+ return NULL;
+}
+
+/*
+ * Select a buffered sequential zone for reclaim.
+ * Walk the mapped sequential zone list from its head and return the
+ * first zone that has a buffer zone and can be locked for reclaim, or
+ * NULL if there is none.
+ */
+static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
+{
+ struct dm_zone *zone;
+
+ /* An empty list simply results in no iteration */
+ list_for_each_entry(zone, &zmd->map_seq_list, link) {
+ /* Unbuffered zones are skipped without trying to lock them */
+ if (zone->bzone && dmz_lock_zone_reclaim(zone))
+ return zone;
+ }
+
+ return NULL;
+}
+
+/*
+ * Select a zone for reclaim.
+ * Returns a zone locked for reclaim, or NULL if no zone can be selected.
+ */
+struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd)
+{
+ struct dm_zone *zone;
+ bool no_reserved_seq;
+
+ dmz_lock_map(zmd);
+
+ /*
+ * Two cases are possible when searching for a candidate:
+ * (1) The reserved sequential zone list is empty. A random data
+ * zone cannot be reclaimed then, so pick a sequential zone so
+ * that afterward a random zone can be reclaimed.
+ * (2) A reserved sequential zone is available. Pick the oldest
+ * random zone (data or buffer) that can be locked.
+ */
+ no_reserved_seq = list_empty(&zmd->reserved_seq_zones_list);
+ zone = no_reserved_seq ? dmz_get_seq_zone_for_reclaim(zmd) :
+ dmz_get_rnd_zone_for_reclaim(zmd);
+
+ dmz_unlock_map(zmd);
+
+ return zone;
+}
+
+/*
+ * Activate a zone: set its DMZ_ACTIVE flag and increment its reference
+ * count. Each call must be balanced with dmz_deactivate_zone().
+ */
+void dmz_activate_zone(struct dm_zone *zone)
+{
+ set_bit(DMZ_ACTIVE, &zone->flags);
+ atomic_inc(&zone->refcount);
+}
+
+/*
+ * Deactivate a zone. This decrements the zone reference counter
+ * and clears the active state of the zone once the count reaches 0,
+ * indicating that all BIOs to the zone have completed. Nothing is
+ * returned: callers can use dmz_is_active() to check the zone state.
+ */
+void dmz_deactivate_zone(struct dm_zone *zone)
+{
+ if (atomic_dec_and_test(&zone->refcount)) {
+ WARN_ON(!test_bit(DMZ_ACTIVE, &zone->flags));
+ clear_bit_unlock(DMZ_ACTIVE, &zone->flags);
+ smp_mb__after_atomic();
+ }
+}
+
+/*
+ * Get the zone mapping a chunk, if the chunk is mapped already.
+ * If no mapping exist and the operation is WRITE, a zone is
+ * allocated and used to map the chunk.
+ * The zone returned will be set to the active state.
+ *
+ * Returns the data zone mapping @chunk, NULL if the chunk is unmapped
+ * and @op is not REQ_OP_WRITE, or ERR_PTR(-EIO) if the mapping is
+ * inconsistent or if a zone write error cannot be handled.
+ *
+ * This function may sleep waiting for a free zone or for a zone reclaim
+ * to complete. The wait helpers drop and retake the metadata lock, so
+ * the caller is expected to hold it.
+ */
+struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op)
+{
+ struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
+ struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
+ int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK;
+ unsigned int dzone_id;
+ struct dm_zone *dzone = NULL;
+ int ret = 0;
+
+ dmz_lock_map(zmd);
+again:
+ /* Get the chunk mapping */
+ dzone_id = le32_to_cpu(dmap[dmap_idx].dzone_id);
+ if (dzone_id == DMZ_MAP_UNMAPPED) {
+ /*
+ * Read or discard in unmapped chunks are fine. But for
+ * writes, we need a mapping, so get one.
+ */
+ if (op != REQ_OP_WRITE)
+ goto out;
+
+ /* Allocate a random zone */
+ dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
+ if (!dzone) {
+ /* The mapping may change while waiting: check it again */
+ dmz_wait_for_free_zones(zmd);
+ goto again;
+ }
+
+ dmz_map_zone(zmd, dzone, chunk);
+
+ } else {
+ /* The chunk is already mapped: get the mapping zone */
+ dzone = dmz_get(zmd, dzone_id);
+ if (dzone->chunk != chunk) {
+ dzone = ERR_PTR(-EIO);
+ goto out;
+ }
+
+ /* Repair write pointer if the sequential dzone has error */
+ if (dmz_seq_write_err(dzone)) {
+ ret = dmz_handle_seq_write_err(zmd, dzone);
+ if (ret) {
+ dzone = ERR_PTR(-EIO);
+ goto out;
+ }
+ clear_bit(DMZ_SEQ_WRITE_ERR, &dzone->flags);
+ }
+ }
+
+ /*
+ * If the zone is being reclaimed, the chunk mapping may change
+ * to a different zone. So wait for reclaim and retry. Otherwise,
+ * activate the zone (this will prevent reclaim from touching it).
+ */
+ if (dmz_in_reclaim(dzone)) {
+ dmz_wait_for_reclaim(zmd, dzone);
+ goto again;
+ }
+ dmz_activate_zone(dzone);
+ dmz_lru_zone(zmd, dzone);
+out:
+ dmz_unlock_map(zmd);
+
+ return dzone;
+}
+
+/*
+ * Write and discard change the block validity of data zones and their buffer
+ * zones. Check here that valid blocks are still present. If all blocks are
+ * invalid, the zones can be unmapped on the fly without waiting for reclaim
+ * to do it.
+ *
+ * This also deactivates @dzone, dropping the reference taken when the
+ * zone was activated.
+ */
+void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *dzone)
+{
+ struct dm_zone *bzone;
+
+ dmz_lock_map(zmd);
+
+ bzone = dzone->bzone;
+ if (bzone) {
+ if (dmz_weight(bzone))
+ dmz_lru_zone(zmd, bzone);
+ else {
+ /* Empty buffer zone: reclaim it */
+ dmz_unmap_zone(zmd, bzone);
+ dmz_free_zone(zmd, bzone);
+ bzone = NULL;
+ }
+ }
+
+ /* Deactivate the data zone */
+ dmz_deactivate_zone(dzone);
+ /* Keep the zone mapped if it is still in use, buffered or not empty */
+ if (dmz_is_active(dzone) || bzone || dmz_weight(dzone))
+ dmz_lru_zone(zmd, dzone);
+ else {
+ /* Unbuffered inactive empty data zone: reclaim it */
+ dmz_unmap_zone(zmd, dzone);
+ dmz_free_zone(zmd, dzone);
+ }
+
+ dmz_unlock_map(zmd);
+}
+
+/*
+ * Allocate and map a random zone to buffer a chunk
+ * already mapped to a sequential zone.
+ *
+ * Returns the buffer zone of @dzone: the existing one if the zone is
+ * already buffered, or a newly allocated and mapped one otherwise.
+ * This function may sleep waiting for a free random zone.
+ */
+struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
+ struct dm_zone *dzone)
+{
+ struct dm_zone *bzone;
+
+ dmz_lock_map(zmd);
+again:
+ /* Checked again after each wait: a buffer may have been set meanwhile */
+ bzone = dzone->bzone;
+ if (bzone)
+ goto out;
+
+ /* Allocate a random zone */
+ bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
+ if (!bzone) {
+ dmz_wait_for_free_zones(zmd);
+ goto again;
+ }
+
+ /* Update the chunk mapping */
+ dmz_set_chunk_mapping(zmd, dzone->chunk, dmz_id(zmd, dzone),
+ dmz_id(zmd, bzone));
+
+ /* Pair the data zone and its new buffer zone */
+ set_bit(DMZ_BUF, &bzone->flags);
+ bzone->chunk = dzone->chunk;
+ bzone->bzone = dzone;
+ dzone->bzone = bzone;
+ list_add_tail(&bzone->link, &zmd->map_rnd_list);
+out:
+ dmz_unlock_map(zmd);
+
+ return bzone;
+}
+
+/*
+ * Get an unmapped (free) zone.
+ * This must be called with the mapping lock held.
+ *
+ * A random zone is allocated if DMZ_ALLOC_RND is set in @flags, a
+ * sequential zone otherwise. If the corresponding unmapped list is
+ * empty and DMZ_ALLOC_RECLAIM is set, a reserved sequential zone is
+ * returned instead, if one is available.
+ *
+ * Returns the allocated zone, removed from its list, or NULL if no
+ * zone is available.
+ */
+struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags)
+{
+ struct list_head *list;
+ struct dm_zone *zone;
+
+ if (flags & DMZ_ALLOC_RND)
+ list = &zmd->unmap_rnd_list;
+ else
+ list = &zmd->unmap_seq_list;
+again:
+ if (list_empty(list)) {
+ /*
+ * No free zone: if this is for reclaim, allow using the
+ * reserved sequential zones.
+ */
+ if (!(flags & DMZ_ALLOC_RECLAIM) ||
+ list_empty(&zmd->reserved_seq_zones_list))
+ return NULL;
+
+ zone = list_first_entry(&zmd->reserved_seq_zones_list,
+ struct dm_zone, link);
+ list_del_init(&zone->link);
+ atomic_dec(&zmd->nr_reserved_seq_zones);
+ return zone;
+ }
+
+ zone = list_first_entry(list, struct dm_zone, link);
+ list_del_init(&zone->link);
+
+ if (dmz_is_rnd(zone))
+ atomic_dec(&zmd->unmap_nr_rnd);
+ else
+ atomic_dec(&zmd->unmap_nr_seq);
+
+ /* An offline zone is dropped from the free list: try the next one */
+ if (dmz_is_offline(zone)) {
+ dmz_dev_warn(zmd->dev, "Zone %u is offline", dmz_id(zmd, zone));
+ zone = NULL;
+ goto again;
+ }
+
+ return zone;
+}
+
+/*
+ * Free a zone.
+ * This must be called with the mapping lock held.
+ *
+ * The zone is added to the unmapped list of its type. A sequential zone
+ * is first reset and is used to refill the reserved sequential zone
+ * list if that list is below nr_reserved_seq. Tasks waiting for a free
+ * zone are woken up.
+ */
+void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+ /* If this is a sequential zone, reset it */
+ /* NOTE(review): the dmz_reset_zone() return value is ignored here */
+ if (dmz_is_seq(zone))
+ dmz_reset_zone(zmd, zone);
+
+ /* Return the zone to its type unmap list */
+ if (dmz_is_rnd(zone)) {
+ list_add_tail(&zone->link, &zmd->unmap_rnd_list);
+ atomic_inc(&zmd->unmap_nr_rnd);
+ } else if (atomic_read(&zmd->nr_reserved_seq_zones) <
+ zmd->nr_reserved_seq) {
+ list_add_tail(&zone->link, &zmd->reserved_seq_zones_list);
+ atomic_inc(&zmd->nr_reserved_seq_zones);
+ } else {
+ list_add_tail(&zone->link, &zmd->unmap_seq_list);
+ atomic_inc(&zmd->unmap_nr_seq);
+ }
+
+ wake_up_all(&zmd->free_wq);
+}
+
+/*
+ * Map a chunk to a zone: record @dzone as the unbuffered data zone of
+ * @chunk in the mapping table and add the zone at the tail of the
+ * mapped list of its type.
+ * This must be called with the mapping lock held.
+ */
+void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone,
+ unsigned int chunk)
+{
+ struct list_head *map_list;
+
+ /* Set the chunk mapping, without a buffer zone */
+ dmz_set_chunk_mapping(zmd, chunk, dmz_id(zmd, dzone),
+ DMZ_MAP_UNMAPPED);
+ dzone->chunk = chunk;
+
+ map_list = dmz_is_rnd(dzone) ? &zmd->map_rnd_list : &zmd->map_seq_list;
+ list_add_tail(&dzone->link, map_list);
+}
+
+/*
+ * Unmap a zone.
+ * This must be called with the mapping lock held.
+ *
+ * For a buffer zone, only the buffer mapping of the chunk is cleared and
+ * the data zone stays mapped. For a data zone, the chunk becomes fully
+ * unmapped. In both cases the zone is removed from its mapped list.
+ * The zone itself is not freed: see dmz_free_zone().
+ */
+void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+ unsigned int chunk = zone->chunk;
+ unsigned int dzone_id;
+
+ if (chunk == DMZ_MAP_UNMAPPED) {
+ /* Already unmapped */
+ return;
+ }
+
+ if (test_and_clear_bit(DMZ_BUF, &zone->flags)) {
+ /*
+ * Unmapping the chunk buffer zone: clear only
+ * the chunk buffer mapping
+ */
+ dzone_id = dmz_id(zmd, zone->bzone);
+ zone->bzone->bzone = NULL;
+ zone->bzone = NULL;
+
+ } else {
+ /*
+ * Unmapping the chunk data zone: the zone must
+ * not be buffered.
+ */
+ if (WARN_ON(zone->bzone)) {
+ zone->bzone->bzone = NULL;
+ zone->bzone = NULL;
+ }
+ dzone_id = DMZ_MAP_UNMAPPED;
+ }
+
+ /* In both cases the chunk is left without a buffer zone */
+ dmz_set_chunk_mapping(zmd, chunk, dzone_id, DMZ_MAP_UNMAPPED);
+
+ zone->chunk = DMZ_MAP_UNMAPPED;
+ list_del_init(&zone->link);
+}
+
+/*
+ * Set @nr_bits bits in @bitmap starting from @bit.
+ * Return the number of bits changed from 0 to 1.
+ */
+static unsigned int dmz_set_bits(unsigned long *bitmap,
+ unsigned int bit, unsigned int nr_bits)
+{
+ unsigned int last = bit + nr_bits;
+ unsigned int changed = 0;
+ unsigned long *word;
+
+ while (bit < last) {
+ bool aligned = !(bit & (BITS_PER_LONG - 1));
+
+ /* Fast path: a word aligned, fully covered, all zero word */
+ if (aligned && (last - bit) >= BITS_PER_LONG) {
+ word = bitmap + BIT_WORD(bit);
+ if (*word == 0) {
+ *word = ULONG_MAX;
+ changed += BITS_PER_LONG;
+ bit += BITS_PER_LONG;
+ continue;
+ }
+ }
+
+ /* Slow path: one bit at a time */
+ changed += !test_and_set_bit(bit, bitmap);
+ bit++;
+ }
+
+ return changed;
+}
+
+/*
+ * Get the bitmap block storing the bit for chunk_block in zone.
+ *
+ * Bitmap blocks follow the super block (1 block) and the nr_map_blocks
+ * chunk mapping blocks, with zone_nr_bitmap_blocks blocks per zone.
+ * Returns the metadata block, or an ERR_PTR() value on error, as
+ * returned by dmz_get_mblock().
+ */
+static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd,
+ struct dm_zone *zone,
+ sector_t chunk_block)
+{
+ /*
+ * Widen the zone ID before multiplying so that the product is
+ * computed with the width of sector_t.
+ */
+ sector_t bitmap_block = 1 + zmd->nr_map_blocks +
+ (sector_t)dmz_id(zmd, zone) * zmd->zone_nr_bitmap_blocks +
+ (chunk_block >> DMZ_BLOCK_SHIFT_BITS);
+
+ return dmz_get_mblock(zmd, bitmap_block);
+}
+
+/*
+ * Copy the valid blocks bitmap of from_zone to the bitmap of to_zone.
+ *
+ * The bitmap is copied one metadata block at a time and the weight of
+ * to_zone is set to the weight of from_zone.
+ * Returns 0 on success or the error of a failed bitmap block access.
+ */
+int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
+ struct dm_zone *to_zone)
+{
+ struct dmz_mblock *from_mblk, *to_mblk;
+ sector_t chunk_block = 0;
+
+ /* Get the zones bitmap blocks */
+ while (chunk_block < zmd->dev->zone_nr_blocks) {
+ from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block);
+ if (IS_ERR(from_mblk))
+ return PTR_ERR(from_mblk);
+ to_mblk = dmz_get_bitmap(zmd, to_zone, chunk_block);
+ if (IS_ERR(to_mblk)) {
+ dmz_release_mblock(zmd, from_mblk);
+ return PTR_ERR(to_mblk);
+ }
+
+ memcpy(to_mblk->data, from_mblk->data, DMZ_BLOCK_SIZE);
+ dmz_dirty_mblock(zmd, to_mblk);
+
+ dmz_release_mblock(zmd, to_mblk);
+ dmz_release_mblock(zmd, from_mblk);
+
+ /* Advance by the number of bits stored in one bitmap block */
+ chunk_block += DMZ_BLOCK_SIZE_BITS;
+ }
+
+ to_zone->weight = from_zone->weight;
+
+ return 0;
+}
+
+/*
+ * Merge the valid blocks bitmap of from_zone into the bitmap of to_zone,
+ * starting from chunk_block.
+ *
+ * Each range of valid blocks found in from_zone is validated in to_zone.
+ * Returns 0 on success or when no more valid blocks are found, and a
+ * negative error code on failure.
+ */
+int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
+ struct dm_zone *to_zone, sector_t chunk_block)
+{
+ unsigned int nr_blocks;
+ int ret;
+
+ /* Get the zones bitmap blocks */
+ while (chunk_block < zmd->dev->zone_nr_blocks) {
+ /* Get a valid region from the source zone */
+ ret = dmz_first_valid_block(zmd, from_zone, &chunk_block);
+ /* Error (< 0) or no more valid blocks (0): stop */
+ if (ret <= 0)
+ return ret;
+
+ nr_blocks = ret;
+ ret = dmz_validate_blocks(zmd, to_zone, chunk_block, nr_blocks);
+ if (ret)
+ return ret;
+
+ chunk_block += nr_blocks;
+ }
+
+ return 0;
+}
+
+/*
+ * Validate all the blocks in the range
+ * [chunk_block..chunk_block+nr_blocks-1].
+ *
+ * The bits of the blocks are set in the zone bitmap and the zone weight
+ * is increased by the number of bits that changed from 0 to 1, capped
+ * at the number of blocks of a zone.
+ * Returns 0 on success or the error of a failed bitmap block access.
+ */
+int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
+ sector_t chunk_block, unsigned int nr_blocks)
+{
+ unsigned int count, bit, nr_bits;
+ unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks;
+ struct dmz_mblock *mblk;
+ unsigned int n = 0;
+
+ dmz_dev_debug(zmd->dev, "=> VALIDATE zone %u, block %llu, %u blocks",
+ dmz_id(zmd, zone), (unsigned long long)chunk_block,
+ nr_blocks);
+
+ WARN_ON(chunk_block + nr_blocks > zone_nr_blocks);
+
+ while (nr_blocks) {
+ /* Get bitmap block */
+ mblk = dmz_get_bitmap(zmd, zone, chunk_block);
+ if (IS_ERR(mblk))
+ return PTR_ERR(mblk);
+
+ /* Set bits (limited to the bits stored in this bitmap block) */
+ bit = chunk_block & DMZ_BLOCK_MASK_BITS;
+ nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+
+ count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
+ if (count) {
+ /* Dirty the block only if something changed */
+ dmz_dirty_mblock(zmd, mblk);
+ n += count;
+ }
+ dmz_release_mblock(zmd, mblk);
+
+ nr_blocks -= nr_bits;
+ chunk_block += nr_bits;
+ }
+
+ if (likely(zone->weight + n <= zone_nr_blocks))
+ zone->weight += n;
+ else {
+ dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be <= %u",
+ dmz_id(zmd, zone), zone->weight,
+ zone_nr_blocks - n);
+ zone->weight = zone_nr_blocks;
+ }
+
+ return 0;
+}
+
+/*
+ * Clear @nr_bits bits in @bitmap starting from @bit.
+ * Return the number of bits changed from 1 to 0.
+ */
+static int dmz_clear_bits(unsigned long *bitmap, int bit, int nr_bits)
+{
+ int last = bit + nr_bits;
+ int cleared = 0;
+ unsigned long *word;
+
+ while (bit < last) {
+ bool aligned = !(bit & (BITS_PER_LONG - 1));
+
+ /* Fast path: a word aligned, fully covered, all ones word */
+ if (aligned && (last - bit) >= BITS_PER_LONG) {
+ word = bitmap + BIT_WORD(bit);
+ if (*word == ULONG_MAX) {
+ *word = 0;
+ cleared += BITS_PER_LONG;
+ bit += BITS_PER_LONG;
+ continue;
+ }
+ }
+
+ /* Slow path: one bit at a time */
+ if (test_and_clear_bit(bit, bitmap))
+ cleared++;
+ bit++;
+ }
+
+ return cleared;
+}
+
+/*
+ * Invalidate all the blocks in the range
+ * [chunk_block..chunk_block+nr_blocks-1].
+ *
+ * The bits of the blocks are cleared in the zone bitmap and the zone
+ * weight is decreased by the number of bits that changed from 1 to 0,
+ * without going below 0.
+ * Returns 0 on success or the error of a failed bitmap block access.
+ */
+int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
+ sector_t chunk_block, unsigned int nr_blocks)
+{
+ unsigned int count, bit, nr_bits;
+ struct dmz_mblock *mblk;
+ unsigned int n = 0;
+
+ dmz_dev_debug(zmd->dev, "=> INVALIDATE zone %u, block %llu, %u blocks",
+ dmz_id(zmd, zone), (u64)chunk_block, nr_blocks);
+
+ WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks);
+
+ while (nr_blocks) {
+ /* Get bitmap block */
+ mblk = dmz_get_bitmap(zmd, zone, chunk_block);
+ if (IS_ERR(mblk))
+ return PTR_ERR(mblk);
+
+ /* Clear bits (limited to the bits stored in this bitmap block) */
+ bit = chunk_block & DMZ_BLOCK_MASK_BITS;
+ nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+
+ count = dmz_clear_bits((unsigned long *)mblk->data,
+ bit, nr_bits);
+ if (count) {
+ /* Dirty the block only if something changed */
+ dmz_dirty_mblock(zmd, mblk);
+ n += count;
+ }
+ dmz_release_mblock(zmd, mblk);
+
+ nr_blocks -= nr_bits;
+ chunk_block += nr_bits;
+ }
+
+ if (zone->weight >= n)
+ zone->weight -= n;
+ else {
+ dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be >= %u",
+ dmz_id(zmd, zone), zone->weight, n);
+ zone->weight = 0;
+ }
+
+ return 0;
+}
+
+/*
+ * Get the validity bit of a block.
+ * Returns 1 if the bit of @chunk_block is set in the bitmap of @zone,
+ * 0 if it is clear, and a negative error code if the bitmap block
+ * cannot be accessed.
+ */
+static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone,
+ sector_t chunk_block)
+{
+ struct dmz_mblock *mblk;
+ unsigned long *bitmap;
+ int is_set;
+
+ WARN_ON(chunk_block >= zmd->dev->zone_nr_blocks);
+
+ /* Get the bitmap block holding the bit */
+ mblk = dmz_get_bitmap(zmd, zone, chunk_block);
+ if (IS_ERR(mblk))
+ return PTR_ERR(mblk);
+
+ /* Test the bit at the block offset within the bitmap block */
+ bitmap = (unsigned long *) mblk->data;
+ is_set = test_bit(chunk_block & DMZ_BLOCK_MASK_BITS, bitmap) ? 1 : 0;
+
+ dmz_release_mblock(zmd, mblk);
+
+ return is_set;
+}
+
+/*
+ * Return the number of blocks from chunk_block to the first block with a bit
+ * value specified by set. Search at most nr_blocks blocks from chunk_block.
+ *
+ * If no such block is found, the number of blocks searched is returned.
+ * A negative error code is returned if a bitmap block cannot be accessed.
+ *
+ * NOTE(review): within a bitmap block, the bit search is bounded by
+ * DMZ_BLOCK_SIZE_BITS and not by nr_bits, so the value returned may
+ * exceed nr_blocks when the searched range ends before the end of a
+ * bitmap block -- confirm that callers never hit this case.
+ */
+static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
+ sector_t chunk_block, unsigned int nr_blocks,
+ int set)
+{
+ struct dmz_mblock *mblk;
+ unsigned int bit, set_bit, nr_bits;
+ unsigned long *bitmap;
+ int n = 0;
+
+ WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks);
+
+ while (nr_blocks) {
+ /* Get bitmap block */
+ mblk = dmz_get_bitmap(zmd, zone, chunk_block);
+ if (IS_ERR(mblk))
+ return PTR_ERR(mblk);
+
+ /* Get offset */
+ bitmap = (unsigned long *) mblk->data;
+ bit = chunk_block & DMZ_BLOCK_MASK_BITS;
+ nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+ if (set)
+ set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
+ else
+ set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
+ dmz_release_mblock(zmd, mblk);
+
+ /* Found in this bitmap block: stop. Otherwise, go to the next one */
+ n += set_bit - bit;
+ if (set_bit < DMZ_BLOCK_SIZE_BITS)
+ break;
+
+ nr_blocks -= nr_bits;
+ chunk_block += nr_bits;
+ }
+
+ return n;
+}
+
+/*
+ * Test if chunk_block is valid in zone.
+ * Returns 0 if the block is not valid, the number of consecutive valid
+ * blocks starting from chunk_block if it is, and a negative error code
+ * on failure.
+ */
+int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone,
+ sector_t chunk_block)
+{
+ unsigned int remaining;
+ int valid = dmz_test_block(zmd, zone, chunk_block);
+
+ /* Invalid block or error: nothing to count */
+ if (valid <= 0)
+ return valid;
+
+ /* Count the valid blocks up to the next invalid block */
+ remaining = zmd->dev->zone_nr_blocks - chunk_block;
+
+ return dmz_to_next_set_block(zmd, zone, chunk_block, remaining, 0);
+}
+
+/*
+ * Find the first valid block from @chunk_block in @zone.
+ * The number of the block found is stored in @chunk_block, and the
+ * number of consecutive valid blocks starting from it is returned.
+ * A negative error code is returned on failure.
+ */
+int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
+ sector_t *chunk_block)
+{
+ sector_t block = *chunk_block;
+ int skipped;
+
+ /* Skip the invalid blocks up to the first valid one */
+ skipped = dmz_to_next_set_block(zmd, zone, block,
+ zmd->dev->zone_nr_blocks - block, 1);
+ if (skipped < 0)
+ return skipped;
+
+ block += skipped;
+ *chunk_block = block;
+
+ /* Count the valid blocks from there */
+ return dmz_to_next_set_block(zmd, zone, block,
+ zmd->dev->zone_nr_blocks - block, 0);
+}
+
+/*
+ * Return the number of bits set in @bitmap within the range
+ * [@bit, @bit + @nr_bits - 1].
+ *
+ * When the current position is aligned on a long word, at least a full
+ * word remains in the range and that word has all its bits set, the whole
+ * word is accounted at once. All other bits are tested one by one.
+ */
+static int dmz_count_bits(void *bitmap, int bit, int nr_bits)
+{
+	unsigned long *words = bitmap;
+	int last = bit + nr_bits;
+	int count = 0;
+
+	while (bit < last) {
+		bool full_word = !(bit & (BITS_PER_LONG - 1)) &&
+				 last - bit >= BITS_PER_LONG &&
+				 words[BIT_WORD(bit)] == ULONG_MAX;
+
+		if (full_word) {
+			count += BITS_PER_LONG;
+			bit += BITS_PER_LONG;
+		} else {
+			if (test_bit(bit, bitmap))
+				count++;
+			bit++;
+		}
+	}
+
+	return count;
+}
+
+/*
+ * Compute the weight of @zone, that is, the number of bits set in the
+ * zone bitmap blocks, and store it in zone->weight. If a bitmap block
+ * cannot be obtained, the weight is set to 0.
+ */
+static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	unsigned int remaining = zmd->dev->zone_nr_blocks;
+	sector_t chunk_block = 0;
+	int weight = 0;
+
+	while (remaining) {
+		struct dmz_mblock *mblk;
+		unsigned int first_bit, nbits;
+
+		/* Bitmap block covering the current position */
+		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
+		if (IS_ERR(mblk)) {
+			weight = 0;
+			break;
+		}
+
+		/* Count the bits of this block that belong to the zone */
+		first_bit = chunk_block & DMZ_BLOCK_MASK_BITS;
+		nbits = min(remaining, DMZ_BLOCK_SIZE_BITS - first_bit);
+		weight += dmz_count_bits(mblk->data, first_bit, nbits);
+
+		dmz_release_mblock(zmd, mblk);
+
+		remaining -= nbits;
+		chunk_block += nbits;
+	}
+
+	zone->weight = weight;
+}
+
+/*
+ * Cleanup the zoned metadata resources.
+ *
+ * Releases the chunk mapping blocks, frees both super block buffers,
+ * empties the dirty and LRU lists of cached metadata blocks, frees any
+ * block still left in the mblock rbtree and drops the zone descriptors.
+ * The dmz_metadata structure itself is not freed here.
+ */
+static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
+{
+ struct rb_root *root;
+ struct dmz_mblock *mblk, *next;
+ int i;
+
+ /* Release zone mapping resources */
+ if (zmd->map_mblk) {
+ for (i = 0; i < zmd->nr_map_blocks; i++)
+ dmz_release_mblock(zmd, zmd->map_mblk[i]);
+ kfree(zmd->map_mblk);
+ zmd->map_mblk = NULL;
+ }
+
+ /* Release super blocks */
+ for (i = 0; i < 2; i++) {
+ if (zmd->sb[i].mblk) {
+ dmz_free_mblock(zmd, zmd->sb[i].mblk);
+ zmd->sb[i].mblk = NULL;
+ }
+ }
+
+ /*
+ * Free cached blocks. Blocks found on the dirty list at this
+ * point are reported with a warning before being freed.
+ */
+ while (!list_empty(&zmd->mblk_dirty_list)) {
+ mblk = list_first_entry(&zmd->mblk_dirty_list,
+ struct dmz_mblock, link);
+ dmz_dev_warn(zmd->dev, "mblock %llu still in dirty list (ref %u)",
+ (u64)mblk->no, atomic_read(&mblk->ref));
+ list_del_init(&mblk->link);
+ rb_erase(&mblk->node, &zmd->mblk_rbtree);
+ dmz_free_mblock(zmd, mblk);
+ }
+
+ while (!list_empty(&zmd->mblk_lru_list)) {
+ mblk = list_first_entry(&zmd->mblk_lru_list,
+ struct dmz_mblock, link);
+ list_del_init(&mblk->link);
+ rb_erase(&mblk->node, &zmd->mblk_rbtree);
+ dmz_free_mblock(zmd, mblk);
+ }
+
+ /*
+ * Sanity checks: the mblock rbtree should now be empty. Any block
+ * still present is reported, has its reference count reset and is
+ * freed during the postorder walk, without being erased from the
+ * tree.
+ */
+ root = &zmd->mblk_rbtree;
+ rbtree_postorder_for_each_entry_safe(mblk, next, root, node) {
+ dmz_dev_warn(zmd->dev, "mblock %llu ref %u still in rbtree",
+ (u64)mblk->no, atomic_read(&mblk->ref));
+ atomic_set(&mblk->ref, 0);
+ dmz_free_mblock(zmd, mblk);
+ }
+
+ /* Free the zone descriptors */
+ dmz_drop_zones(zmd);
+}
+
+/*
+ * Initialize the zoned metadata.
+ *
+ * Allocates and initializes a metadata descriptor for @dev, initializes
+ * the zone descriptors, loads the super block and the chunk mapping
+ * table, flags the metadata zones and registers the metadata block cache
+ * shrinker.
+ *
+ * On success, 0 is returned and *@metadata points to the new descriptor.
+ * On failure, all resources are released, *@metadata is set to NULL and a
+ * negative error code is returned (-ENOMEM if the descriptor cannot be
+ * allocated, in which case *@metadata is left untouched).
+ */
+int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
+{
+	struct dmz_metadata *zmd;
+	unsigned int i, zid;
+	struct dm_zone *zone;
+	int ret;
+
+	zmd = kzalloc(sizeof(struct dmz_metadata), GFP_KERNEL);
+	if (!zmd)
+		return -ENOMEM;
+
+	zmd->dev = dev;
+	zmd->mblk_rbtree = RB_ROOT;
+	init_rwsem(&zmd->mblk_sem);
+	mutex_init(&zmd->mblk_flush_lock);
+	spin_lock_init(&zmd->mblk_lock);
+	INIT_LIST_HEAD(&zmd->mblk_lru_list);
+	INIT_LIST_HEAD(&zmd->mblk_dirty_list);
+
+	mutex_init(&zmd->map_lock);
+	atomic_set(&zmd->unmap_nr_rnd, 0);
+	INIT_LIST_HEAD(&zmd->unmap_rnd_list);
+	INIT_LIST_HEAD(&zmd->map_rnd_list);
+
+	atomic_set(&zmd->unmap_nr_seq, 0);
+	INIT_LIST_HEAD(&zmd->unmap_seq_list);
+	INIT_LIST_HEAD(&zmd->map_seq_list);
+
+	atomic_set(&zmd->nr_reserved_seq_zones, 0);
+	INIT_LIST_HEAD(&zmd->reserved_seq_zones_list);
+
+	init_waitqueue_head(&zmd->free_wq);
+
+	/* Initialize zone descriptors */
+	ret = dmz_init_zones(zmd);
+	if (ret)
+		goto err;
+
+	/* Get super block */
+	ret = dmz_load_sb(zmd);
+	if (ret)
+		goto err;
+
+	/* Set metadata zones starting from sb_zone */
+	zid = dmz_id(zmd, zmd->sb_zone);
+	for (i = 0; i < zmd->nr_meta_zones << 1; i++) {
+		zone = dmz_get(zmd, zid + i);
+		if (!dmz_is_rnd(zone)) {
+			/*
+			 * ret is 0 at this point: set an error code so
+			 * that the caller does not see a success with a
+			 * NULL metadata descriptor.
+			 */
+			dmz_dev_err(dev, "Metadata zone %u is not a random zone",
+				    zid + i);
+			ret = -ENXIO;
+			goto err;
+		}
+		set_bit(DMZ_META, &zone->flags);
+	}
+
+	/* Load mapping table */
+	ret = dmz_load_mapping(zmd);
+	if (ret)
+		goto err;
+
+	/*
+	 * Cache size boundaries: allow at least 2 super blocks, the chunk map
+	 * blocks and enough blocks to be able to cache the bitmap blocks of
+	 * up to 16 zones when idle (min_nr_mblks). Otherwise, if busy, allow
+	 * the cache to add 512 more metadata blocks.
+	 */
+	zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16;
+	zmd->max_nr_mblks = zmd->min_nr_mblks + 512;
+	zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count;
+	zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan;
+	zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;
+
+	/* Metadata cache shrinker */
+	ret = register_shrinker(&zmd->mblk_shrinker);
+	if (ret) {
+		dmz_dev_err(dev, "Register metadata cache shrinker failed");
+		goto err;
+	}
+
+	dmz_dev_info(dev, "Host-%s zoned block device",
+		     bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
+		     "aware" : "managed");
+	dmz_dev_info(dev, " %llu 512-byte logical sectors",
+		     (u64)dev->capacity);
+	dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors",
+		     dev->nr_zones, (u64)dev->zone_nr_sectors);
+	dmz_dev_info(dev, " %u metadata zones",
+		     zmd->nr_meta_zones * 2);
+	dmz_dev_info(dev, " %u data zones for %u chunks",
+		     zmd->nr_data_zones, zmd->nr_chunks);
+	dmz_dev_info(dev, " %u random zones (%u unmapped)",
+		     zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd));
+	dmz_dev_info(dev, " %u sequential zones (%u unmapped)",
+		     zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq));
+	dmz_dev_info(dev, " %u reserved sequential data zones",
+		     zmd->nr_reserved_seq);
+
+	dmz_dev_debug(dev, "Format:");
+	dmz_dev_debug(dev, "%u metadata blocks per set (%u max cache)",
+		      zmd->nr_meta_blocks, zmd->max_nr_mblks);
+	dmz_dev_debug(dev, " %u data zone mapping blocks",
+		      zmd->nr_map_blocks);
+	dmz_dev_debug(dev, " %u bitmap blocks",
+		      zmd->nr_bitmap_blocks);
+
+	*metadata = zmd;
+
+	return 0;
+err:
+	dmz_cleanup_metadata(zmd);
+	kfree(zmd);
+	*metadata = NULL;
+
+	return ret;
+}
+
+/*
+ * Cleanup the zoned metadata resources.
+ *
+ * Unregisters the metadata block cache shrinker, releases all metadata
+ * resources with dmz_cleanup_metadata() and frees @zmd, which must not be
+ * used afterwards.
+ */
+void dmz_dtr_metadata(struct dmz_metadata *zmd)
+{
+ unregister_shrinker(&zmd->mblk_shrinker);
+ dmz_cleanup_metadata(zmd);
+ kfree(zmd);
+}
+
+/*
+ * Check zone information on resume.
+ *
+ * Every zone of the device is updated with dmz_update_zone() and its
+ * write pointer position is compared with the position recorded before
+ * the update. Offline zones are only reported. Zones that are not
+ * sequential get their write pointer reset to 0. For a sequential zone
+ * whose write pointer changed, the recorded position is restored and all
+ * blocks from that position to the end of the zone are invalidated.
+ *
+ * Returns 0 on success, -EIO if a zone descriptor cannot be obtained, or
+ * the error returned by dmz_update_zone().
+ */
+int dmz_resume_metadata(struct dmz_metadata *zmd)
+{
+	struct dmz_dev *dev = zmd->dev;
+	unsigned int zid;
+
+	for (zid = 0; zid < dev->nr_zones; zid++) {
+		struct dm_zone *zone = dmz_get(zmd, zid);
+		sector_t saved_wp;
+		int err;
+
+		if (!zone) {
+			dmz_dev_err(dev, "Unable to get zone %u", zid);
+			return -EIO;
+		}
+
+		/* Remember the write pointer position known so far */
+		saved_wp = zone->wp_block;
+
+		err = dmz_update_zone(zmd, zone);
+		if (err) {
+			dmz_dev_err(dev, "Broken zone %u", zid);
+			return err;
+		}
+
+		if (dmz_is_offline(zone)) {
+			dmz_dev_warn(dev, "Zone %u is offline", zid);
+			continue;
+		}
+
+		if (!dmz_is_seq(zone)) {
+			zone->wp_block = 0;
+			continue;
+		}
+
+		if (zone->wp_block == saved_wp)
+			continue;
+
+		/* Unexpected write pointer: fall back to the known position */
+		dmz_dev_err(dev, "Zone %u: Invalid wp (%llu / %llu)",
+			    zid, (u64)zone->wp_block, (u64)saved_wp);
+		zone->wp_block = saved_wp;
+		dmz_invalidate_blocks(zmd, zone, zone->wp_block,
+				      dev->zone_nr_blocks - zone->wp_block);
+	}
+
+	return 0;
+}
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
new file mode 100644
index 000000000000..05c0a126f5c8
--- /dev/null
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -0,0 +1,570 @@
+/*
+ * Copyright (C) 2017 Western Digital Corporation or its affiliates.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-zoned.h"
+
+#include <linux/module.h>
+
+#define DM_MSG_PREFIX "zoned reclaim"
+
+struct dmz_reclaim {
+ struct dmz_metadata *metadata;
+ struct dmz_dev *dev;
+
+ struct delayed_work work;
+ struct workqueue_struct *wq;
+
+ struct dm_kcopyd_client *kc;
+ struct dm_kcopyd_throttle kc_throttle;
+ int kc_err;
+
+ unsigned long flags;
+
+ /*
+ * Last target access time, in jiffies: initialized when the reclaim
+ * context is created and refreshed by dmz_reclaim_bio_acc(). It is
+ * compared against DMZ_IDLE_PERIOD by dmz_target_idle().
+ */
+ unsigned long atime;
+};
+
+/*
+ * Reclaim state flags (bit numbers used with dmz_reclaim.flags).
+ *
+ * DMZ_RECLAIM_KCOPY is set by dmz_reclaim_copy() before starting a kcopyd
+ * copy and cleared by the copy end notification; dmz_reclaim_copy() waits
+ * on this bit for the copy to complete.
+ */
+enum {
+ DMZ_RECLAIM_KCOPY,
+};
+
+/*
+ * Period of target BIO inactivity, in jiffies (10 seconds), after which
+ * the target is considered idle.
+ */
+#define DMZ_IDLE_PERIOD (10UL * HZ)
+
+/*
+ * Percentage of unmapped (free) random zones below which reclaim starts
+ * even if the target is busy.
+ */
+#define DMZ_RECLAIM_LOW_UNMAP_RND 30
+
+/*
+ * Percentage of unmapped (free) random zones above which reclaim will
+ * stop if the target is busy.
+ */
+#define DMZ_RECLAIM_HIGH_UNMAP_RND 50
+
+/*
+ * Advance the write pointer of the sequential zone @zone to @block by
+ * zeroing out the blocks between the current write pointer and @block.
+ *
+ * Returns 0 if the write pointer is already at @block or was advanced,
+ * -EIO if the write pointer is already past @block, or the error
+ * returned by blkdev_issue_zeroout().
+ */
+static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
+				sector_t block)
+{
+	struct dmz_metadata *zmd = zrc->metadata;
+	sector_t wp_block = zone->wp_block;
+	unsigned int gap;
+	int err;
+
+	/* The write pointer cannot be moved backward */
+	if (wp_block > block)
+		return -EIO;
+
+	/* Nothing to do if the write pointer is already in position */
+	if (wp_block == block)
+		return 0;
+
+	/* Fill the hole up to the requested position with zeroes */
+	gap = block - wp_block;
+	err = blkdev_issue_zeroout(zrc->dev->bdev,
+				   dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
+				   dmz_blk2sect(gap), GFP_NOFS, false);
+	if (!err) {
+		zone->wp_block = block;
+		return 0;
+	}
+
+	dmz_dev_err(zrc->dev,
+		    "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
+		    dmz_id(zmd, zone), (unsigned long long)wp_block,
+		    (unsigned long long)block, gap, err);
+
+	return err;
+}
+
+/*
+ * dm_kcopyd_copy completion callback: record the copy result in the
+ * reclaim context passed as @context (-EIO on any read or write error,
+ * 0 otherwise), then clear the DMZ_RECLAIM_KCOPY bit and wake up the
+ * waiter on that bit.
+ */
+static void dmz_reclaim_kcopy_end(int read_err, unsigned long write_err,
+				  void *context)
+{
+	struct dmz_reclaim *zrc = context;
+
+	zrc->kc_err = (read_err || write_err) ? -EIO : 0;
+
+	clear_bit_unlock(DMZ_RECLAIM_KCOPY, &zrc->flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&zrc->flags, DMZ_RECLAIM_KCOPY);
+}
+
+/*
+ * Copy valid blocks of src_zone into dst_zone.
+ *
+ * The source zone is walked with dmz_first_valid_block() and each run of
+ * valid blocks is copied with kcopyd to the same block offset in the
+ * destination zone, waiting for each copy to complete before looking for
+ * the next run. The walk ends at the source zone write pointer for a
+ * sequential source zone and at the end of the zone otherwise.
+ *
+ * Returns 0 when the walk completes or when no more valid blocks are
+ * found. Otherwise returns the error reported by the valid block lookup,
+ * by the destination write pointer alignment, by dm_kcopyd_copy() or by
+ * the copy end notification (-EIO).
+ */
+static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
+ struct dm_zone *src_zone, struct dm_zone *dst_zone)
+{
+ struct dmz_metadata *zmd = zrc->metadata;
+ struct dmz_dev *dev = zrc->dev;
+ struct dm_io_region src, dst;
+ sector_t block = 0, end_block;
+ sector_t nr_blocks;
+ sector_t src_zone_block;
+ sector_t dst_zone_block;
+ unsigned long flags = 0;
+ int ret;
+
+ if (dmz_is_seq(src_zone))
+ end_block = src_zone->wp_block;
+ else
+ end_block = dev->zone_nr_blocks;
+ src_zone_block = dmz_start_block(zmd, src_zone);
+ dst_zone_block = dmz_start_block(zmd, dst_zone);
+
+ if (dmz_is_seq(dst_zone))
+ set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
+
+ while (block < end_block) {
+ /*
+ * Get a valid region from the source zone: block is moved to
+ * the start of the region and ret is its number of blocks.
+ */
+ ret = dmz_first_valid_block(zmd, src_zone, &block);
+ if (ret <= 0)
+ return ret;
+ nr_blocks = ret;
+
+ /*
+ * If we are writing in a sequential zone, we must make sure
+ * that writes are sequential. So Zeroout any eventual hole
+ * between writes.
+ */
+ if (dmz_is_seq(dst_zone)) {
+ ret = dmz_reclaim_align_wp(zrc, dst_zone, block);
+ if (ret)
+ return ret;
+ }
+
+ src.bdev = dev->bdev;
+ src.sector = dmz_blk2sect(src_zone_block + block);
+ src.count = dmz_blk2sect(nr_blocks);
+
+ dst.bdev = dev->bdev;
+ dst.sector = dmz_blk2sect(dst_zone_block + block);
+ dst.count = src.count;
+
+ /*
+ * Copy the valid region. The DMZ_RECLAIM_KCOPY bit is cleared
+ * by dmz_reclaim_kcopy_end() when the copy ends.
+ */
+ set_bit(DMZ_RECLAIM_KCOPY, &zrc->flags);
+ ret = dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags,
+ dmz_reclaim_kcopy_end, zrc);
+ if (ret)
+ return ret;
+
+ /* Wait for copy to complete */
+ wait_on_bit_io(&zrc->flags, DMZ_RECLAIM_KCOPY,
+ TASK_UNINTERRUPTIBLE);
+ if (zrc->kc_err)
+ return zrc->kc_err;
+
+ block += nr_blocks;
+ if (dmz_is_seq(dst_zone))
+ dst_zone->wp_block = block;
+ }
+
+ return 0;
+}
+
+/*
+ * Move valid blocks of dzone buffer zone into dzone (after its write pointer)
+ * and free the buffer zone.
+ *
+ * Returns 0 on success. If copying the blocks or merging the valid block
+ * bitmaps fails, the error is returned and the buffer zone is left
+ * mapped; the caller is then responsible for releasing the reclaim lock
+ * held on @dzone.
+ */
+static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone)
+{
+	struct dm_zone *bzone = dzone->bzone;
+	sector_t chunk_block = dzone->wp_block;
+	struct dmz_metadata *zmd = zrc->metadata;
+	int ret;
+
+	dmz_dev_debug(zrc->dev,
+		      "Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)",
+		      dzone->chunk, dmz_id(zmd, bzone), dmz_weight(bzone),
+		      dmz_id(zmd, dzone), dmz_weight(dzone));
+
+	/* Flush the buffer zone into the data zone */
+	ret = dmz_reclaim_copy(zrc, bzone, dzone);
+	if (ret < 0)
+		return ret;
+
+	dmz_lock_flush(zmd);
+
+	/* Validate copied blocks */
+	ret = dmz_merge_valid_blocks(zmd, bzone, dzone, chunk_block);
+	if (ret == 0) {
+		/* Free the buffer zone */
+		dmz_invalidate_blocks(zmd, bzone, 0, zrc->dev->zone_nr_blocks);
+		dmz_lock_map(zmd);
+		dmz_unmap_zone(zmd, bzone);
+		dmz_unlock_zone_reclaim(dzone);
+		dmz_free_zone(zmd, bzone);
+		dmz_unlock_map(zmd);
+	}
+
+	dmz_unlock_flush(zmd);
+
+	/*
+	 * Report a merge failure: the reclaim lock on dzone is only
+	 * released above on success, so the caller must see the error to
+	 * release it.
+	 */
+	return ret;
+}
+
+/*
+ * Merge valid blocks of dzone into its buffer zone and free dzone.
+ *
+ * Returns 0 on success, in which case the chunk is remapped to the
+ * buffer zone. If copying the blocks or merging the valid block bitmaps
+ * fails, the error is returned and the mapping is left unchanged; the
+ * caller is then responsible for releasing the reclaim lock held on
+ * @dzone.
+ */
+static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
+{
+	unsigned int chunk = dzone->chunk;
+	struct dm_zone *bzone = dzone->bzone;
+	struct dmz_metadata *zmd = zrc->metadata;
+	int ret = 0;
+
+	dmz_dev_debug(zrc->dev,
+		      "Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)",
+		      chunk, dmz_id(zmd, dzone), dmz_weight(dzone),
+		      dmz_id(zmd, bzone), dmz_weight(bzone));
+
+	/* Flush data zone into the buffer zone */
+	ret = dmz_reclaim_copy(zrc, dzone, bzone);
+	if (ret < 0)
+		return ret;
+
+	dmz_lock_flush(zmd);
+
+	/* Validate copied blocks */
+	ret = dmz_merge_valid_blocks(zmd, dzone, bzone, 0);
+	if (ret == 0) {
+		/*
+		 * Free the data zone and remap the chunk to
+		 * the buffer zone.
+		 */
+		dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks);
+		dmz_lock_map(zmd);
+		dmz_unmap_zone(zmd, bzone);
+		dmz_unmap_zone(zmd, dzone);
+		dmz_unlock_zone_reclaim(dzone);
+		dmz_free_zone(zmd, dzone);
+		dmz_map_zone(zmd, bzone, chunk);
+		dmz_unlock_map(zmd);
+	}
+
+	dmz_unlock_flush(zmd);
+
+	/*
+	 * Report a merge failure: the reclaim lock on dzone is only
+	 * released above on success, so the caller must see the error to
+	 * release it.
+	 */
+	return ret;
+}
+
+/*
+ * Move valid blocks of the random data zone dzone into a free sequential zone.
+ * Once blocks are moved, remap the zone chunk to the sequential zone.
+ *
+ * Returns 0 on success and -ENOSPC if no sequential zone can be
+ * allocated. If copying the blocks or validating them in the sequential
+ * zone fails, the sequential zone is freed and the error is returned. In
+ * all failure cases the caller is responsible for releasing the reclaim
+ * lock held on @dzone.
+ */
+static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
+{
+	unsigned int chunk = dzone->chunk;
+	struct dm_zone *szone = NULL;
+	struct dmz_metadata *zmd = zrc->metadata;
+	int ret;
+
+	/* Get a free sequential zone */
+	dmz_lock_map(zmd);
+	szone = dmz_alloc_zone(zmd, DMZ_ALLOC_RECLAIM);
+	dmz_unlock_map(zmd);
+	if (!szone)
+		return -ENOSPC;
+
+	dmz_dev_debug(zrc->dev,
+		      "Chunk %u, move rnd zone %u (weight %u) to seq zone %u",
+		      chunk, dmz_id(zmd, dzone), dmz_weight(dzone),
+		      dmz_id(zmd, szone));
+
+	/* Flush the random data zone into the sequential zone */
+	ret = dmz_reclaim_copy(zrc, dzone, szone);
+
+	dmz_lock_flush(zmd);
+
+	if (ret == 0) {
+		/* Validate copied blocks */
+		ret = dmz_copy_valid_blocks(zmd, dzone, szone);
+	}
+	if (ret) {
+		/* Free the sequential zone */
+		dmz_lock_map(zmd);
+		dmz_free_zone(zmd, szone);
+		dmz_unlock_map(zmd);
+	} else {
+		/* Free the data zone and remap the chunk */
+		dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks);
+		dmz_lock_map(zmd);
+		dmz_unmap_zone(zmd, dzone);
+		dmz_unlock_zone_reclaim(dzone);
+		dmz_free_zone(zmd, dzone);
+		dmz_map_zone(zmd, szone, chunk);
+		dmz_unlock_map(zmd);
+	}
+
+	dmz_unlock_flush(zmd);
+
+	/*
+	 * Report copy and validation failures: the reclaim lock on dzone
+	 * is only released above on success, so the caller must see the
+	 * error to release it.
+	 */
+	return ret;
+}
+
+/*
+ * Reclaim an empty zone.
+ *
+ * No block needs to be copied: with the flush lock and then the map lock
+ * held, the zone is unmapped, dmz_unlock_zone_reclaim() is called on it
+ * and the zone is freed.
+ */
+static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone)
+{
+ struct dmz_metadata *zmd = zrc->metadata;
+
+ dmz_lock_flush(zmd);
+ dmz_lock_map(zmd);
+ dmz_unmap_zone(zmd, dzone);
+ dmz_unlock_zone_reclaim(dzone);
+ dmz_free_zone(zmd, dzone);
+ dmz_unlock_map(zmd);
+ dmz_unlock_flush(zmd);
+}
+
+/*
+ * Find a candidate zone for reclaim and process it.
+ *
+ * The zone returned by dmz_get_zone_for_reclaim() is handled as follows:
+ * - a random zone with a zero weight is freed directly;
+ * - a random zone with valid blocks is moved to a free sequential zone;
+ * - for any other zone, either its buffer zone is merged into it or it is
+ * merged into its buffer zone, depending on the position of the first
+ * valid block of the buffer zone relative to the zone write pointer.
+ *
+ * If processing fails, dmz_unlock_zone_reclaim() is called on the
+ * selected zone and nothing else is done. Otherwise, metadata is flushed
+ * (the flush result is ignored).
+ */
+static void dmz_reclaim(struct dmz_reclaim *zrc)
+{
+ struct dmz_metadata *zmd = zrc->metadata;
+ struct dm_zone *dzone;
+ struct dm_zone *rzone;
+ unsigned long start;
+ int ret;
+
+ /* Get a data zone */
+ dzone = dmz_get_zone_for_reclaim(zmd);
+ if (!dzone)
+ return;
+
+ start = jiffies;
+
+ if (dmz_is_rnd(dzone)) {
+ if (!dmz_weight(dzone)) {
+ /* Empty zone */
+ dmz_reclaim_empty(zrc, dzone);
+ ret = 0;
+ } else {
+ /*
+ * Reclaim the random data zone by moving its
+ * valid data blocks to a free sequential zone.
+ */
+ ret = dmz_reclaim_rnd_data(zrc, dzone);
+ }
+ rzone = dzone;
+
+ } else {
+ struct dm_zone *bzone = dzone->bzone;
+ sector_t chunk_block = 0;
+
+ ret = dmz_first_valid_block(zmd, bzone, &chunk_block);
+ if (ret < 0)
+ goto out;
+
+ if (ret == 0 || chunk_block >= dzone->wp_block) {
+ /*
+ * The buffer zone is empty or its valid blocks are
+ * after the data zone write pointer.
+ */
+ ret = dmz_reclaim_buf(zrc, dzone);
+ rzone = bzone;
+ } else {
+ /*
+ * Reclaim the data zone by merging it into the
+ * buffer zone so that the buffer zone itself can
+ * be later reclaimed.
+ */
+ ret = dmz_reclaim_seq_data(zrc, dzone);
+ rzone = dzone;
+ }
+ }
+out:
+ if (ret) {
+ dmz_unlock_zone_reclaim(dzone);
+ return;
+ }
+
+ (void) dmz_flush_metadata(zrc->metadata);
+
+ dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms",
+ dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start));
+}
+
+/*
+ * Tell whether the target is idle, that is, whether more than
+ * DMZ_IDLE_PERIOD elapsed since the last recorded target access.
+ */
+static inline int dmz_target_idle(struct dmz_reclaim *zrc)
+{
+	unsigned long idle_start = zrc->atime + DMZ_IDLE_PERIOD;
+
+	return time_is_before_jiffies(idle_start);
+}
+
+/*
+ * Decide whether reclaim should run now.
+ *
+ * Reclaim is needed when the target is idle and at least one random zone
+ * is mapped, or, regardless of the target activity, when the percentage
+ * of unmapped (free) random zones is at or below
+ * DMZ_RECLAIM_LOW_UNMAP_RND.
+ */
+static bool dmz_should_reclaim(struct dmz_reclaim *zrc)
+{
+	struct dmz_metadata *zmd = zrc->metadata;
+	unsigned int total_rnd = dmz_nr_rnd_zones(zmd);
+	unsigned int free_rnd = dmz_nr_unmap_rnd_zones(zmd);
+	unsigned int free_pct = free_rnd * 100 / total_rnd;
+
+	/* Idle target with something to reclaim */
+	if (dmz_target_idle(zrc) && free_rnd < total_rnd)
+		return true;
+
+	/*
+	 * Busy target: do not reclaim while plenty of random zones are
+	 * still unmapped, only when their percentage is low.
+	 */
+	return free_pct < DMZ_RECLAIM_HIGH_UNMAP_RND &&
+	       free_pct <= DMZ_RECLAIM_LOW_UNMAP_RND;
+}
+
+/*
+ * Reclaim work function.
+ *
+ * If reclaim is not needed, the work is simply re-armed to run again
+ * after DMZ_IDLE_PERIOD. Otherwise, the kcopyd throttle is adjusted, one
+ * zone is reclaimed and the work is rescheduled if reclaim is still
+ * needed.
+ */
+static void dmz_reclaim_work(struct work_struct *work)
+{
+	struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work);
+	struct dmz_metadata *zmd = zrc->metadata;
+	unsigned int total_rnd, free_rnd;
+	unsigned int free_pct;
+
+	if (!dmz_should_reclaim(zrc)) {
+		mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
+		return;
+	}
+
+	/*
+	 * Random zones must be reclaimed: choose the zone copy throttling
+	 * so that reclaim goes fast when the target is idle or very low on
+	 * free random zones, and slower otherwise to limit the impact on
+	 * the user workload.
+	 */
+	total_rnd = dmz_nr_rnd_zones(zmd);
+	free_rnd = dmz_nr_unmap_rnd_zones(zmd);
+	free_pct = free_rnd * 100 / total_rnd;
+	if (!dmz_target_idle(zrc) && free_pct >= DMZ_RECLAIM_LOW_UNMAP_RND / 2) {
+		/* Busy with some free random zones left: throttle */
+		zrc->kc_throttle.throttle = min(75U, 100U - free_pct / 2);
+	} else {
+		/* Idle or very low percentage: full speed */
+		zrc->kc_throttle.throttle = 100;
+	}
+
+	dmz_dev_debug(zrc->dev,
+		      "Reclaim (%u): %s, %u%% free rnd zones (%u/%u)",
+		      zrc->kc_throttle.throttle,
+		      (dmz_target_idle(zrc) ? "Idle" : "Busy"),
+		      free_pct, free_rnd, total_rnd);
+
+	dmz_reclaim(zrc);
+
+	dmz_schedule_reclaim(zrc);
+}
+
+/*
+ * Create the reclaim context of a target.
+ *
+ * Allocates the context, creates its kcopyd client and its ordered
+ * workqueue, and queues the reclaim work for immediate execution.
+ *
+ * Returns 0 on success with *@reclaim set to the new context. Returns
+ * -ENOMEM if the context or the workqueue cannot be allocated, or the
+ * kcopyd client creation error. *@reclaim is not modified on failure.
+ */
+int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
+		    struct dmz_reclaim **reclaim)
+{
+	struct dmz_reclaim *zrc;
+	int ret;
+
+	zrc = kzalloc(sizeof(struct dmz_reclaim), GFP_KERNEL);
+	if (!zrc)
+		return -ENOMEM;
+
+	zrc->dev = dev;
+	zrc->metadata = zmd;
+	zrc->atime = jiffies;
+
+	/* kcopyd client used to copy zone blocks */
+	zrc->kc = dm_kcopyd_client_create(&zrc->kc_throttle);
+	if (IS_ERR(zrc->kc)) {
+		ret = PTR_ERR(zrc->kc);
+		goto err_free;
+	}
+
+	/* Reclaim work and its workqueue */
+	INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work);
+	zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s", WQ_MEM_RECLAIM,
+					  dev->name);
+	if (!zrc->wq) {
+		ret = -ENOMEM;
+		goto err_kc;
+	}
+
+	*reclaim = zrc;
+	queue_delayed_work(zrc->wq, &zrc->work, 0);
+
+	return 0;
+
+err_kc:
+	dm_kcopyd_client_destroy(zrc->kc);
+err_free:
+	kfree(zrc);
+
+	return ret;
+}
+
+/*
+ * Terminate reclaim.
+ *
+ * Cancels the reclaim work, waiting for it if it is running, then
+ * destroys the reclaim workqueue and the kcopyd client and frees @zrc,
+ * which must not be used afterwards.
+ */
+void dmz_dtr_reclaim(struct dmz_reclaim *zrc)
+{
+ cancel_delayed_work_sync(&zrc->work);
+ destroy_workqueue(zrc->wq);
+ dm_kcopyd_client_destroy(zrc->kc);
+ kfree(zrc);
+}
+
+/*
+ * Suspend reclaim: cancel the reclaim work and wait for it to finish if
+ * it is currently running. Reclaim is restarted with
+ * dmz_resume_reclaim().
+ */
+void dmz_suspend_reclaim(struct dmz_reclaim *zrc)
+{
+ cancel_delayed_work_sync(&zrc->work);
+}
+
+/*
+ * Resume reclaim: queue the reclaim work so that it runs after
+ * DMZ_IDLE_PERIOD.
+ */
+void dmz_resume_reclaim(struct dmz_reclaim *zrc)
+{
+ queue_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
+}
+
+/*
+ * BIO accounting: record the current time (jiffies) as the last target
+ * access time. This is the time dmz_target_idle() uses to decide if the
+ * target is idle.
+ */
+void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc)
+{
+ zrc->atime = jiffies;
+}
+
+/*
+ * Trigger the reclaim work immediately if dmz_should_reclaim() reports
+ * that reclaim is needed. Otherwise, the work schedule is left as is.
+ */
+void dmz_schedule_reclaim(struct dmz_reclaim *zrc)
+{
+	if (!dmz_should_reclaim(zrc))
+		return;
+
+	mod_delayed_work(zrc->wq, &zrc->work, 0);
+}
+
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
new file mode 100644
index 000000000000..2b538fa817f4
--- /dev/null
+++ b/drivers/md/dm-zoned-target.c
@@ -0,0 +1,967 @@
+/*
+ * Copyright (C) 2017 Western Digital Corporation or its affiliates.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-zoned.h"
+
+#include <linux/module.h>
+
+#define DM_MSG_PREFIX "zoned"
+
+#define DMZ_MIN_BIOS 8192
+
+/*
+ * Zone BIO context, stored in the per-BIO data of target BIOs (see the
+ * dm_per_bio_data() calls in this file).
+ *
+ * zone is the data zone found by dmz_handle_bio() for the BIO chunk, bio
+ * is the target BIO completed when a partial read clone ends, ref is
+ * incremented each time the BIO or one of its clones is submitted to the
+ * device and status keeps the first error reported with dmz_bio_endio().
+ */
+struct dmz_bioctx {
+ struct dmz_target *target;
+ struct dm_zone *zone;
+ struct bio *bio;
+ atomic_t ref;
+ blk_status_t status;
+};
+
+/*
+ * Chunk work descriptor.
+ *
+ * Holds the list of BIOs queued for a chunk of a target, processed by
+ * dmz_chunk_work(). The reference count is handled with
+ * dmz_get_chunk_work() and dmz_put_chunk_work(): when it drops to zero,
+ * the descriptor is removed from the target chunk radix tree and freed.
+ */
+struct dm_chunk_work {
+ struct work_struct work;
+ atomic_t refcount;
+ struct dmz_target *target;
+ unsigned int chunk;
+ struct bio_list bio_list;
+};
+
+/*
+ * Target descriptor.
+ */
+struct dmz_target {
+ struct dm_dev *ddev;
+
+ unsigned long flags;
+
+ /* Zoned block device information */
+ struct dmz_dev *dev;
+
+ /* For metadata handling */
+ struct dmz_metadata *metadata;
+
+ /* For reclaim */
+ struct dmz_reclaim *reclaim;
+
+ /*
+ * For chunk work. chunk_lock is held by dmz_chunk_work() while it
+ * pops BIOs from a chunk work list and drops chunk work references;
+ * chunk_rxtree holds the chunk work descriptors, indexed by chunk.
+ */
+ struct mutex chunk_lock;
+ struct radix_tree_root chunk_rxtree;
+ struct workqueue_struct *chunk_wq;
+
+ /* For cloned BIOs to zones */
+ struct bio_set *bio_set;
+
+ /*
+ * For flush. flush_lock is taken by dmz_flush_work() around
+ * accesses to flush_list.
+ */
+ spinlock_t flush_lock;
+ struct bio_list flush_list;
+ struct delayed_work flush_work;
+ struct workqueue_struct *flush_wq;
+};
+
+/*
+ * Flush interval, in jiffies (10 seconds).
+ */
+#define DMZ_FLUSH_PERIOD (10 * HZ)
+
+/*
+ * Complete a target BIO with @status.
+ *
+ * The status is recorded in the BIO context only if it is an error and no
+ * error was recorded before, so that the first error reported is the one
+ * kept. bio_endio() is then called on the BIO.
+ */
+static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
+{
+	struct dmz_bioctx *ctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+	bool first_error = status != BLK_STS_OK && ctx->status == BLK_STS_OK;
+
+	if (first_error)
+		ctx->status = status;
+
+	bio_endio(bio);
+}
+
+/*
+ * Partial clone read BIO completion callback. This terminates the
+ * target BIO when there are no more references to its context.
+ *
+ * The clone status is saved before the clone is released with bio_put()
+ * and is then reported with dmz_bio_endio() on the target BIO found in
+ * the BIO context attached to the clone.
+ */
+static void dmz_read_bio_end_io(struct bio *bio)
+{
+ struct dmz_bioctx *bioctx = bio->bi_private;
+ blk_status_t status = bio->bi_status;
+
+ bio_put(bio);
+ dmz_bio_endio(bioctx->bio, status);
+}
+
+/*
+ * Issue a BIO to a zone. The BIO may only partially process the
+ * original target BIO.
+ *
+ * If nr_blocks covers all of the BIO, the BIO itself is remapped to
+ * chunk_block of the zone and submitted. Otherwise, a clone limited to
+ * nr_blocks is submitted and the BIO is advanced past the cloned range.
+ * In both cases, a reference is taken on the BIO context before
+ * submission.
+ *
+ * Returns 0 on success or -ENOMEM if the clone cannot be allocated.
+ */
+static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone,
+ struct bio *bio, sector_t chunk_block,
+ unsigned int nr_blocks)
+{
+ struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+ sector_t sector;
+ struct bio *clone;
+
+ /* BIO remap sector */
+ sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
+
+ /* If the read is not partial, there is no need to clone the BIO */
+ if (nr_blocks == dmz_bio_blocks(bio)) {
+ /* Setup and submit the BIO */
+ bio->bi_iter.bi_sector = sector;
+ atomic_inc(&bioctx->ref);
+ generic_make_request(bio);
+ return 0;
+ }
+
+ /* Partial BIO: we need to clone the BIO */
+ clone = bio_clone_fast(bio, GFP_NOIO, dmz->bio_set);
+ if (!clone)
+ return -ENOMEM;
+
+ /*
+ * Setup the clone: limit it to nr_blocks and have its completion
+ * reported to the target BIO through dmz_read_bio_end_io().
+ */
+ clone->bi_iter.bi_sector = sector;
+ clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
+ clone->bi_end_io = dmz_read_bio_end_io;
+ clone->bi_private = bioctx;
+
+ bio_advance(bio, clone->bi_iter.bi_size);
+
+ /* Submit the clone */
+ atomic_inc(&bioctx->ref);
+ generic_make_request(clone);
+
+ return 0;
+}
+
+/*
+ * Zero out pages of discarded blocks accessed by a read BIO.
+ *
+ * The BIO size is temporarily replaced with the size of nr_blocks so that
+ * zero_fill_bio() only clears that many blocks from the current BIO
+ * position. The original size is then restored and the BIO is advanced
+ * past the zeroed range. chunk_block is not used.
+ */
+static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio,
+ sector_t chunk_block, unsigned int nr_blocks)
+{
+ unsigned int size = nr_blocks << DMZ_BLOCK_SHIFT;
+
+ /* Clear nr_blocks */
+ swap(bio->bi_iter.bi_size, size);
+ zero_fill_bio(bio);
+ swap(bio->bi_iter.bi_size, size);
+
+ bio_advance(bio, size);
+}
+
+/*
+ * Process a read BIO.
+ *
+ * For an unmapped chunk (zone is NULL), the BIO is simply zero-filled.
+ * Otherwise, for each position in the BIO block range, valid blocks are
+ * looked up first in the data zone (only below the write pointer if the
+ * zone is not a random zone) and then in the buffer zone, if there is
+ * one. Runs of valid blocks are read from the zone where they were found
+ * and a block valid in neither zone is zero-filled.
+ *
+ * Returns 0 on success or the error returned by dmz_block_valid() or
+ * dmz_submit_read_bio().
+ */
+static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
+ struct bio *bio)
+{
+ sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
+ unsigned int nr_blocks = dmz_bio_blocks(bio);
+ sector_t end_block = chunk_block + nr_blocks;
+ struct dm_zone *rzone, *bzone;
+ int ret;
+
+ /* Read into unmapped chunks need only zeroing the BIO buffer */
+ if (!zone) {
+ zero_fill_bio(bio);
+ return 0;
+ }
+
+ dmz_dev_debug(dmz->dev, "READ chunk %llu -> %s zone %u, block %llu, %u blocks",
+ (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
+ (dmz_is_rnd(zone) ? "RND" : "SEQ"),
+ dmz_id(dmz->metadata, zone),
+ (unsigned long long)chunk_block, nr_blocks);
+
+ /* Check block validity to determine the read location */
+ bzone = zone->bzone;
+ while (chunk_block < end_block) {
+ nr_blocks = 0;
+ if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) {
+ /* Test block validity in the data zone */
+ ret = dmz_block_valid(dmz->metadata, zone, chunk_block);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ /* Read data zone blocks */
+ nr_blocks = ret;
+ rzone = zone;
+ }
+ }
+
+ /*
+ * No valid blocks found in the data zone.
+ * Check the buffer zone, if there is one.
+ */
+ if (!nr_blocks && bzone) {
+ ret = dmz_block_valid(dmz->metadata, bzone, chunk_block);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ /* Read buffer zone blocks */
+ nr_blocks = ret;
+ rzone = bzone;
+ }
+ }
+
+ if (nr_blocks) {
+ /*
+ * Valid blocks found: read them, limiting the read to
+ * the end of the BIO block range.
+ */
+ nr_blocks = min_t(unsigned int, nr_blocks, end_block - chunk_block);
+ ret = dmz_submit_read_bio(dmz, rzone, bio, chunk_block, nr_blocks);
+ if (ret)
+ return ret;
+ chunk_block += nr_blocks;
+ } else {
+ /* No valid block: zeroout the current BIO block */
+ dmz_handle_read_zero(dmz, bio, chunk_block, 1);
+ chunk_block++;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Issue a write BIO to a zone.
+ *
+ * The BIO is remapped to chunk_block of the zone on the target device and
+ * submitted after taking a reference on its context. For a sequential
+ * zone, the zone write pointer is then advanced by nr_blocks.
+ */
+static void dmz_submit_write_bio(struct dmz_target *dmz, struct dm_zone *zone,
+ struct bio *bio, sector_t chunk_block,
+ unsigned int nr_blocks)
+{
+ struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+
+ /* Setup and submit the BIO */
+ bio->bi_bdev = dmz->dev->bdev;
+ bio->bi_iter.bi_sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
+ atomic_inc(&bioctx->ref);
+ generic_make_request(bio);
+
+ if (dmz_is_seq(zone))
+ zone->wp_block += nr_blocks;
+}
+
+/*
+ * Write @nr_blocks blocks at @chunk_block directly in the data zone
+ * @zone. The written blocks are then validated in the data zone and, if
+ * a buffer zone is assigned, invalidated in the buffer zone.
+ *
+ * Returns 0 on success, -EROFS if the zone is read-only, or the error
+ * returned by the block validation or invalidation.
+ */
+static int dmz_handle_direct_write(struct dmz_target *dmz,
+				   struct dm_zone *zone, struct bio *bio,
+				   sector_t chunk_block,
+				   unsigned int nr_blocks)
+{
+	struct dmz_metadata *zmd = dmz->metadata;
+	struct dm_zone *bzone = zone->bzone;
+	int err;
+
+	if (dmz_is_readonly(zone))
+		return -EROFS;
+
+	/* Send the write to the data zone */
+	dmz_submit_write_bio(dmz, zone, bio, chunk_block, nr_blocks);
+
+	/* The data zone now holds the valid copy of the blocks */
+	err = dmz_validate_blocks(zmd, zone, chunk_block, nr_blocks);
+	if (err)
+		return err;
+
+	if (!bzone)
+		return 0;
+
+	/* Any copy of the blocks in the buffer zone is now stale */
+	return dmz_invalidate_blocks(zmd, bzone, chunk_block, nr_blocks);
+}
+
+/*
+ * Write @nr_blocks blocks at @chunk_block in the buffer zone of @zone,
+ * getting a buffer zone first if none is assigned yet. The written
+ * blocks are then validated in the buffer zone and, if they are located
+ * below the data zone write pointer, invalidated in the data zone.
+ * Called with @zone write locked.
+ *
+ * Returns 0 on success, -ENOSPC if no buffer zone can be obtained,
+ * -EROFS if the buffer zone is read-only, or the error returned by the
+ * block validation or invalidation.
+ */
+static int dmz_handle_buffered_write(struct dmz_target *dmz,
+				     struct dm_zone *zone, struct bio *bio,
+				     sector_t chunk_block,
+				     unsigned int nr_blocks)
+{
+	struct dmz_metadata *zmd = dmz->metadata;
+	struct dm_zone *bzone = dmz_get_chunk_buffer(zmd, zone);
+	int err;
+
+	if (!bzone)
+		return -ENOSPC;
+
+	if (dmz_is_readonly(bzone))
+		return -EROFS;
+
+	/* Send the write to the buffer zone */
+	dmz_submit_write_bio(dmz, bzone, bio, chunk_block, nr_blocks);
+
+	/* The buffer zone now holds the valid copy of the blocks */
+	err = dmz_validate_blocks(zmd, bzone, chunk_block, nr_blocks);
+	if (err || chunk_block >= zone->wp_block)
+		return err;
+
+	/* Any copy of the blocks in the data zone is now stale */
+	return dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
+}
+
+/*
+ * Process a write BIO.
+ *
+ * The write is done directly in the data zone if this zone is a random
+ * zone, or if the write starts at the zone write pointer. Any other write
+ * goes through the buffer zone of the data zone.
+ *
+ * Returns -ENOSPC if @zone is NULL, otherwise the result of the direct or
+ * buffered write.
+ */
+static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
+			    struct bio *bio)
+{
+	sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
+	unsigned int nr_blocks = dmz_bio_blocks(bio);
+	bool in_place;
+
+	if (!zone)
+		return -ENOSPC;
+
+	dmz_dev_debug(dmz->dev, "WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
+		      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
+		      (dmz_is_rnd(zone) ? "RND" : "SEQ"),
+		      dmz_id(dmz->metadata, zone),
+		      (unsigned long long)chunk_block, nr_blocks);
+
+	in_place = dmz_is_rnd(zone) || chunk_block == zone->wp_block;
+	if (!in_place) {
+		/* Unaligned write in a sequential zone: buffer it */
+		return dmz_handle_buffered_write(dmz, zone, bio,
+						 chunk_block, nr_blocks);
+	}
+
+	/* Random zone, or write aligned on the zone write pointer */
+	return dmz_handle_direct_write(dmz, zone, bio, chunk_block, nr_blocks);
+}
+
+/*
+ * Process a discard BIO.
+ *
+ * The blocks of the BIO range are invalidated in the data zone (if this
+ * zone is a random zone or if the range starts below the zone write
+ * pointer) and in its buffer zone, if one is mapped.
+ *
+ * Returns 0 on success or if the chunk is not mapped, -EROFS if the zone
+ * is read-only, or the error returned by the block invalidation.
+ */
+static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
+			      struct bio *bio)
+{
+	struct dmz_metadata *zmd = dmz->metadata;
+	sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
+	unsigned int nr_blocks = dmz_bio_blocks(bio);
+	int err = 0;
+
+	/* Nothing to discard in an unmapped chunk */
+	if (!zone)
+		return 0;
+
+	if (dmz_is_readonly(zone))
+		return -EROFS;
+
+	dmz_dev_debug(dmz->dev, "DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
+		      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
+		      dmz_id(zmd, zone),
+		      (unsigned long long)chunk_block, nr_blocks);
+
+	/* Data zone first */
+	if (dmz_is_rnd(zone) || chunk_block < zone->wp_block)
+		err = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
+	if (err || !zone->bzone)
+		return err;
+
+	/* Then the buffer zone */
+	return dmz_invalidate_blocks(zmd, zone->bzone, chunk_block, nr_blocks);
+}
+
+/*
+ * Process a BIO.
+ *
+ * With the metadata lock held, the data zone mapping the BIO chunk is
+ * looked up and the BIO is passed to the read, write or discard handler
+ * (any other operation fails with -EIO). The BIO is always completed
+ * with dmz_bio_endio(), using the status resulting from the lookup or
+ * from the handler. cw is not used.
+ */
+static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
+ struct bio *bio)
+{
+ struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+ struct dmz_metadata *zmd = dmz->metadata;
+ struct dm_zone *zone;
+ int ret;
+
+ /*
+ * Write may trigger a zone allocation. So make sure the
+ * allocation can succeed.
+ */
+ if (bio_op(bio) == REQ_OP_WRITE)
+ dmz_schedule_reclaim(dmz->reclaim);
+
+ dmz_lock_metadata(zmd);
+
+ /*
+ * Get the data zone mapping the chunk. There may be no
+ * mapping for read and discard. If a mapping is obtained,
+ * the zone returned will be set to active state.
+ */
+ zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(dmz->dev, bio),
+ bio_op(bio));
+ if (IS_ERR(zone)) {
+ ret = PTR_ERR(zone);
+ goto out;
+ }
+
+ /* Process the BIO */
+ if (zone) {
+ dmz_activate_zone(zone);
+ bioctx->zone = zone;
+ }
+
+ switch (bio_op(bio)) {
+ case REQ_OP_READ:
+ ret = dmz_handle_read(dmz, zone, bio);
+ break;
+ case REQ_OP_WRITE:
+ ret = dmz_handle_write(dmz, zone, bio);
+ break;
+ case REQ_OP_DISCARD:
+ case REQ_OP_WRITE_ZEROES:
+ ret = dmz_handle_discard(dmz, zone, bio);
+ break;
+ default:
+ dmz_dev_err(dmz->dev, "Unsupported BIO operation 0x%x",
+ bio_op(bio));
+ ret = -EIO;
+ }
+
+ /*
+ * Release the chunk mapping. This will check that the mapping
+ * is still valid, that is, that the zone used still has valid blocks.
+ */
+ if (zone)
+ dmz_put_chunk_mapping(zmd, zone);
+out:
+ dmz_bio_endio(bio, errno_to_blk_status(ret));
+
+ dmz_unlock_metadata(zmd);
+}
+
+/*
+ * Increment a chunk work reference count.
+ * In this file, a reference is taken for each BIO added to the chunk
+ * work BIO list and one more each time the work is successfully queued
+ * (see dmz_queue_chunk_work()). References are dropped with
+ * dmz_put_chunk_work().
+ */
+static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
+{
+ atomic_inc(&cw->refcount);
+}
+
+/*
+ * Drop one reference on a chunk work. When the last reference goes away,
+ * the work is removed from the target chunk radix tree and freed.
+ * A chunk work being released is expected to have an empty BIO list.
+ */
+static void dmz_put_chunk_work(struct dm_chunk_work *cw)
+{
+	/* Other references remain: nothing more to do */
+	if (!atomic_dec_and_test(&cw->refcount))
+		return;
+
+	WARN_ON(!bio_list_empty(&cw->bio_list));
+	radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
+	kfree(cw);
+}
+
+/*
+ * Chunk BIO work function: process all the BIOs queued in the chunk work
+ * BIO list, one at a time. The chunk lock protects the BIO list and the
+ * reference count handling, but is released while a BIO is processed.
+ */
+static void dmz_chunk_work(struct work_struct *work)
+{
+	struct dm_chunk_work *cw = container_of(work, struct dm_chunk_work, work);
+	struct dmz_target *dmz = cw->target;
+
+	mutex_lock(&dmz->chunk_lock);
+
+	/* Process the chunk BIOs */
+	for (;;) {
+		struct bio *bio = bio_list_pop(&cw->bio_list);
+
+		if (!bio)
+			break;
+
+		mutex_unlock(&dmz->chunk_lock);
+		dmz_handle_bio(dmz, cw, bio);
+		mutex_lock(&dmz->chunk_lock);
+
+		/* Drop the reference taken when the BIO was added to the list */
+		dmz_put_chunk_work(cw);
+	}
+
+	/* Queueing the work incremented the work refcount */
+	dmz_put_chunk_work(cw);
+
+	mutex_unlock(&dmz->chunk_lock);
+}
+
+/*
+ * Flush work: flush the dirty metadata, complete all the flush BIOs queued
+ * in the target flush list with the result of the flush, and schedule the
+ * next periodic execution of the work.
+ */
+static void dmz_flush_work(struct work_struct *work)
+{
+	struct dmz_target *dmz = container_of(work, struct dmz_target, flush_work.work);
+	blk_status_t status;
+
+	/* Flush dirty metadata blocks */
+	status = errno_to_blk_status(dmz_flush_metadata(dmz->metadata));
+
+	/* Process queued flush requests */
+	for (;;) {
+		struct bio *bio;
+
+		/* The flush list is shared with dmz_map(): pop under the lock */
+		spin_lock(&dmz->flush_lock);
+		bio = bio_list_pop(&dmz->flush_list);
+		spin_unlock(&dmz->flush_lock);
+
+		if (!bio)
+			break;
+
+		dmz_bio_endio(bio, status);
+	}
+
+	queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
+}
+
+/*
+ * Get a chunk work and start it to process a new BIO.
+ * If the BIO chunk has no work yet, create one.
+ *
+ * If the chunk work cannot be allocated or inserted in the chunk radix
+ * tree, the BIO is completed with an error status instead of being
+ * dropped: dmz_map() reports DM_MAPIO_SUBMITTED after calling this
+ * function, so nothing else would ever complete the BIO.
+ */
+static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
+{
+	unsigned int chunk = dmz_bio_chunk(dmz->dev, bio);
+	struct dm_chunk_work *cw;
+	int ret = 0;
+
+	mutex_lock(&dmz->chunk_lock);
+
+	/* Get the BIO chunk work. If one is not active yet, create one */
+	cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
+	if (!cw) {
+		/*
+		 * Create a new chunk work. This is the BIO submission path:
+		 * use GFP_NOIO so that the allocation cannot recurse into
+		 * the I/O path.
+		 */
+		cw = kmalloc(sizeof(*cw), GFP_NOIO);
+		if (unlikely(!cw)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		INIT_WORK(&cw->work, dmz_chunk_work);
+		atomic_set(&cw->refcount, 0);
+		cw->target = dmz;
+		cw->chunk = chunk;
+		bio_list_init(&cw->bio_list);
+
+		ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
+		if (unlikely(ret)) {
+			kfree(cw);
+			goto out;
+		}
+	}
+
+	/* One reference per BIO in the list, dropped by dmz_chunk_work() */
+	bio_list_add(&cw->bio_list, bio);
+	dmz_get_chunk_work(cw);
+
+	/* One more reference if the work was not already queued */
+	if (queue_work(dmz->chunk_wq, &cw->work))
+		dmz_get_chunk_work(cw);
+out:
+	mutex_unlock(&dmz->chunk_lock);
+
+	/*
+	 * On failure the BIO was never added to a chunk work: fail it here,
+	 * outside of the chunk lock.
+	 */
+	if (unlikely(ret))
+		dmz_bio_endio(bio, errno_to_blk_status(ret));
+}
+
+/*
+ * Process a new BIO.
+ *
+ * Returns DM_MAPIO_REMAPPED for empty BIOs that are neither a flush nor
+ * a write, DM_MAPIO_KILL for BIOs that are not aligned on the 4KB block
+ * size, and DM_MAPIO_SUBMITTED for BIOs queued for processing, either in
+ * the flush list or in a chunk work.
+ */
+static int dmz_map(struct dm_target *ti, struct bio *bio)
+{
+	struct dmz_target *dmz = ti->private;
+	struct dmz_dev *dev = dmz->dev;
+	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+	sector_t sector = bio->bi_iter.bi_sector;
+	unsigned int nr_sectors = bio_sectors(bio);
+	sector_t chunk_sector;
+
+	dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
+		      bio_op(bio), (unsigned long long)sector, nr_sectors,
+		      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
+		      (unsigned long long)dmz_chunk_block(dmz->dev, dmz_bio_block(bio)),
+		      (unsigned int)dmz_bio_blocks(bio));
+
+	bio->bi_bdev = dev->bdev;
+
+	/*
+	 * Initialize the BIO context first: dmz_end_io() is called for
+	 * every BIO that completes, including the ones simply remapped
+	 * below, and it reads the context status, reference count and zone.
+	 */
+	bioctx->target = dmz;
+	bioctx->zone = NULL;
+	bioctx->bio = bio;
+	atomic_set(&bioctx->ref, 1);
+	bioctx->status = BLK_STS_OK;
+
+	if (!nr_sectors && (bio_op(bio) != REQ_OP_FLUSH) && (bio_op(bio) != REQ_OP_WRITE))
+		return DM_MAPIO_REMAPPED;
+
+	/* The BIO should be block aligned */
+	if ((nr_sectors & DMZ_BLOCK_SECTORS_MASK) || (sector & DMZ_BLOCK_SECTORS_MASK))
+		return DM_MAPIO_KILL;
+
+	/* Set the BIO pending in the flush list */
+	if (bio_op(bio) == REQ_OP_FLUSH || (!nr_sectors && bio_op(bio) == REQ_OP_WRITE)) {
+		spin_lock(&dmz->flush_lock);
+		bio_list_add(&dmz->flush_list, bio);
+		spin_unlock(&dmz->flush_lock);
+		/* Run the flush work now rather than at its next period */
+		mod_delayed_work(dmz->flush_wq, &dmz->flush_work, 0);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	/* Split zone BIOs to fit entirely into a zone */
+	chunk_sector = sector & (dev->zone_nr_sectors - 1);
+	if (chunk_sector + nr_sectors > dev->zone_nr_sectors)
+		dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector);
+
+	/* Now ready to handle this BIO */
+	dmz_reclaim_bio_acc(dmz->reclaim);
+	dmz_queue_chunk_work(dmz, bio);
+
+	return DM_MAPIO_SUBMITTED;
+}
+
+/*
+ * Completed target BIO processing.
+ *
+ * Record the first error seen in the BIO context and drop one context
+ * reference. Returns DM_ENDIO_INCOMPLETE while references remain. On the
+ * last reference, the saved status is set in the BIO, the zone used (if
+ * any) is deactivated and DM_ENDIO_DONE is returned.
+ */
+static int dmz_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
+{
+	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+	struct dm_zone *zone;
+
+	/* Only the first error is kept */
+	if (*error && bioctx->status == BLK_STS_OK)
+		bioctx->status = *error;
+
+	if (!atomic_dec_and_test(&bioctx->ref))
+		return DM_ENDIO_INCOMPLETE;
+
+	/* Done */
+	bio->bi_status = bioctx->status;
+
+	zone = bioctx->zone;
+	if (!zone)
+		return DM_ENDIO_DONE;
+
+	/* Flag sequential zones that saw a failed write */
+	if (*error && bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
+		set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
+
+	dmz_deactivate_zone(zone);
+
+	return DM_ENDIO_DONE;
+}
+
+/*
+ * Get zoned device information.
+ *
+ * Open the device designated by path and allocate and fill the dmz_dev
+ * descriptor of the target (name, capacity, zone size and number of zones).
+ * The device must be a zoned block device and must be mapped entirely.
+ * Returns 0 on success and a negative error code otherwise, in which case
+ * the device reference and the descriptor are released.
+ */
+static int dmz_get_zoned_device(struct dm_target *ti, char *path)
+{
+	struct dmz_target *dmz = ti->private;
+	struct request_queue *q;
+	struct dmz_dev *dev;
+	int ret;
+
+	/* Get the target device */
+	ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &dmz->ddev);
+	if (ret) {
+		ti->error = "Get target device failed";
+		dmz->ddev = NULL;
+		return ret;
+	}
+
+	dev = kzalloc(sizeof(struct dmz_dev), GFP_KERNEL);
+	if (!dev) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	dev->bdev = dmz->ddev->bdev;
+	(void)bdevname(dev->bdev, dev->name);
+
+	/* Regular block devices are not supported */
+	if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) {
+		ti->error = "Not a zoned block device";
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* The target must start at sector 0 and span the whole device */
+	dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
+	if (ti->begin || (ti->len != dev->capacity)) {
+		ti->error = "Partial mapping not supported";
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* Zone size, in 512B sectors and in 4KB blocks */
+	q = bdev_get_queue(dev->bdev);
+	dev->zone_nr_sectors = q->limits.chunk_sectors;
+	dev->zone_nr_sectors_shift = ilog2(dev->zone_nr_sectors);
+
+	dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
+	dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks);
+
+	/* Number of zones, counting a last zone smaller than the others */
+	dev->nr_zones = (dev->capacity + dev->zone_nr_sectors - 1)
+		>> dev->zone_nr_sectors_shift;
+
+	dmz->dev = dev;
+
+	return 0;
+err:
+	dm_put_device(ti, dmz->ddev);
+	kfree(dev);
+
+	return ret;
+}
+
+/*
+ * Cleanup zoned device information: release the reference on the
+ * underlying device and free the device descriptor.
+ */
+static void dmz_put_zoned_device(struct dm_target *ti)
+{
+	struct dmz_target *target = ti->private;
+
+	dm_put_device(ti, target->ddev);
+
+	kfree(target->dev);
+	target->dev = NULL;
+}
+
+/*
+ * Setup target.
+ *
+ * The target takes a single argument: the path of the zoned block device
+ * to use. Returns 0 on success. On error, a negative error code is
+ * returned and everything that was set up is released.
+ */
+static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct dmz_target *dmz;
+	struct dmz_dev *dev;
+	int ret;
+
+	/* Check arguments */
+	if (argc != 1) {
+		ti->error = "Invalid argument count";
+		return -EINVAL;
+	}
+
+	/* Allocate and initialize the target descriptor */
+	dmz = kzalloc(sizeof(struct dmz_target), GFP_KERNEL);
+	if (!dmz) {
+		ti->error = "Unable to allocate the zoned target descriptor";
+		return -ENOMEM;
+	}
+	ti->private = dmz;
+
+	/* Get the target zoned block device */
+	ret = dmz_get_zoned_device(ti, argv[0]);
+	if (ret) {
+		dmz->ddev = NULL;
+		goto err;
+	}
+
+	/* Initialize metadata */
+	dev = dmz->dev;
+	ret = dmz_ctr_metadata(dev, &dmz->metadata);
+	if (ret) {
+		ti->error = "Metadata initialization failed";
+		goto err_dev;
+	}
+
+	/*
+	 * Set target (no write same support).
+	 * NOTE(review): the "<< 9" looks like a sectors to bytes conversion;
+	 * confirm the unit expected for max_io_len.
+	 */
+	ti->max_io_len = dev->zone_nr_sectors << 9;
+	ti->num_flush_bios = 1;
+	ti->num_discard_bios = 1;
+	ti->num_write_zeroes_bios = 1;
+	ti->per_io_data_size = sizeof(struct dmz_bioctx);
+	ti->flush_supported = true;
+	ti->discards_supported = true;
+	ti->split_discard_bios = true;
+
+	/* The exposed capacity is the number of chunks that can be mapped */
+	ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift;
+
+	/* Zone BIO */
+	dmz->bio_set = bioset_create(DMZ_MIN_BIOS, 0, 0);
+	if (!dmz->bio_set) {
+		ti->error = "Create BIO set failed";
+		ret = -ENOMEM;
+		goto err_meta;
+	}
+
+	/*
+	 * Chunk BIO work. Radix tree nodes are allocated when a chunk work
+	 * is inserted from the BIO submission path, so use GFP_NOIO.
+	 */
+	mutex_init(&dmz->chunk_lock);
+	INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
+	dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND,
+					0, dev->name);
+	if (!dmz->chunk_wq) {
+		ti->error = "Create chunk workqueue failed";
+		ret = -ENOMEM;
+		goto err_bio;
+	}
+
+	/* Flush work */
+	spin_lock_init(&dmz->flush_lock);
+	bio_list_init(&dmz->flush_list);
+	INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work);
+	dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM,
+						dev->name);
+	if (!dmz->flush_wq) {
+		ti->error = "Create flush workqueue failed";
+		ret = -ENOMEM;
+		goto err_cwq;
+	}
+	mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
+
+	/* Initialize reclaim */
+	ret = dmz_ctr_reclaim(dev, dmz->metadata, &dmz->reclaim);
+	if (ret) {
+		ti->error = "Zone reclaim initialization failed";
+		goto err_fwq;
+	}
+
+	dmz_dev_info(dev, "Target device: %llu 512-byte logical sectors (%llu blocks)",
+		     (unsigned long long)ti->len,
+		     (unsigned long long)dmz_sect2blk(ti->len));
+
+	return 0;
+err_fwq:
+	/*
+	 * The periodic flush work was armed above: cancel it before
+	 * destroying its workqueue, otherwise its timer could fire after
+	 * the workqueue and the target descriptor have been freed.
+	 */
+	cancel_delayed_work_sync(&dmz->flush_work);
+	destroy_workqueue(dmz->flush_wq);
+err_cwq:
+	destroy_workqueue(dmz->chunk_wq);
+err_bio:
+	bioset_free(dmz->bio_set);
+err_meta:
+	dmz_dtr_metadata(dmz->metadata);
+err_dev:
+	dmz_put_zoned_device(ti);
+err:
+	kfree(dmz);
+
+	return ret;
+}
+
+/*
+ * Cleanup target: stop the chunk works, reclaim and the flush work, flush
+ * the metadata one last time and release all the target resources.
+ */
+static void dmz_dtr(struct dm_target *ti)
+{
+	struct dmz_target *target = ti->private;
+
+	/* Wait for the chunk works to complete */
+	flush_workqueue(target->chunk_wq);
+	destroy_workqueue(target->chunk_wq);
+
+	dmz_dtr_reclaim(target->reclaim);
+
+	/* Stop the periodic flush before destroying its workqueue */
+	cancel_delayed_work_sync(&target->flush_work);
+	destroy_workqueue(target->flush_wq);
+
+	/* Last metadata flush: its result is ignored */
+	(void) dmz_flush_metadata(target->metadata);
+
+	dmz_dtr_metadata(target->metadata);
+
+	bioset_free(target->bio_set);
+
+	dmz_put_zoned_device(ti);
+
+	kfree(target);
+}
+
+/*
+ * Setup target request queue limits: 4KB logical and physical blocks,
+ * and BIOs, discards and write zeroes limited to the device zone size.
+ */
+static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct dmz_target *dmz = ti->private;
+	unsigned int zone_sectors = dmz->dev->zone_nr_sectors;
+
+	/* Block sizes and I/O size hints */
+	limits->logical_block_size = DMZ_BLOCK_SIZE;
+	limits->physical_block_size = DMZ_BLOCK_SIZE;
+
+	blk_limits_io_min(limits, DMZ_BLOCK_SIZE);
+	blk_limits_io_opt(limits, DMZ_BLOCK_SIZE);
+
+	/* Discard and write zeroes */
+	limits->discard_alignment = DMZ_BLOCK_SIZE;
+	limits->discard_granularity = DMZ_BLOCK_SIZE;
+	limits->max_discard_sectors = zone_sectors;
+	limits->max_hw_discard_sectors = zone_sectors;
+	limits->max_write_zeroes_sectors = zone_sectors;
+
+	/* FS hint to try to align to the device zone size */
+	limits->chunk_sectors = zone_sectors;
+	limits->max_sectors = zone_sectors;
+
+	/* We are exposing a drive-managed zoned block device */
+	limits->zoned = BLK_ZONED_NONE;
+}
+
+/*
+ * Pass on ioctl to the backend device: return the target block device
+ * through bdev. The mode argument is left untouched. Always returns 0.
+ */
+static int dmz_prepare_ioctl(struct dm_target *ti,
+			     struct block_device **bdev, fmode_t *mode)
+{
+	struct dmz_target *target = ti->private;
+	struct dmz_dev *dev = target->dev;
+
+	*bdev = dev->bdev;
+
+	return 0;
+}
+
+/*
+ * Stop works on suspend: wait for the queued chunk works to complete,
+ * suspend reclaim and cancel the periodic flush work.
+ */
+static void dmz_suspend(struct dm_target *ti)
+{
+	struct dmz_target *target = ti->private;
+
+	flush_workqueue(target->chunk_wq);
+	dmz_suspend_reclaim(target->reclaim);
+	cancel_delayed_work_sync(&target->flush_work);
+}
+
+/*
+ * Restart works on resume or if suspend failed: re-arm the periodic
+ * flush work and resume reclaim.
+ */
+static void dmz_resume(struct dm_target *ti)
+{
+	struct dmz_target *target = ti->private;
+
+	queue_delayed_work(target->flush_wq, &target->flush_work, DMZ_FLUSH_PERIOD);
+	dmz_resume_reclaim(target->reclaim);
+}
+
+/*
+ * Call fn for the single device used by the target, over the whole
+ * device capacity (from sector 0), and return its result.
+ */
+static int dmz_iterate_devices(struct dm_target *ti,
+			       iterate_devices_callout_fn fn, void *data)
+{
+	struct dmz_target *dmz = ti->private;
+	sector_t len = dmz->dev->capacity;
+
+	return fn(ti, dmz->ddev, 0, len, data);
+}
+
+/*
+ * Device mapper target type descriptor for the "zoned" target: it ties
+ * the target name, version and features to the methods defined above.
+ * It is registered in dmz_init() and unregistered in dmz_exit().
+ */
+static struct target_type dmz_type = {
+ .name = "zoned",
+ .version = {1, 0, 0},
+ .features = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
+ .module = THIS_MODULE,
+ .ctr = dmz_ctr,
+ .dtr = dmz_dtr,
+ .map = dmz_map,
+ .end_io = dmz_end_io,
+ .io_hints = dmz_io_hints,
+ .prepare_ioctl = dmz_prepare_ioctl,
+ /* Works are stopped on suspend and restarted on resume */
+ .postsuspend = dmz_suspend,
+ .resume = dmz_resume,
+ .iterate_devices = dmz_iterate_devices,
+};
+
+/*
+ * Module initialization: register the zoned target type and return the
+ * result of the registration.
+ */
+static int __init dmz_init(void)
+{
+ return dm_register_target(&dmz_type);
+}
+
+/*
+ * Module cleanup: unregister the zoned target type.
+ */
+static void __exit dmz_exit(void)
+{
+ dm_unregister_target(&dmz_type);
+}
+
+module_init(dmz_init);
+module_exit(dmz_exit);
+
+MODULE_DESCRIPTION(DM_NAME " target for zoned block devices");
+MODULE_AUTHOR("Damien Le Moal <[email protected]>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
new file mode 100644
index 000000000000..12419f0bfe78
--- /dev/null
+++ b/drivers/md/dm-zoned.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright (C) 2017 Western Digital Corporation or its affiliates.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_ZONED_H
+#define DM_ZONED_H
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/radix-tree.h>
+#include <linux/shrinker.h>
+
+/*
+ * dm-zoned creates block devices with 4KB blocks, always.
+ */
+/* Block size in bytes: shift, size and offset mask */
+#define DMZ_BLOCK_SHIFT 12
+#define DMZ_BLOCK_SIZE (1 << DMZ_BLOCK_SHIFT)
+#define DMZ_BLOCK_MASK (DMZ_BLOCK_SIZE - 1)
+
+/* Block size in bits (8 bits per byte, hence the "+ 3") */
+#define DMZ_BLOCK_SHIFT_BITS (DMZ_BLOCK_SHIFT + 3)
+#define DMZ_BLOCK_SIZE_BITS (1 << DMZ_BLOCK_SHIFT_BITS)
+#define DMZ_BLOCK_MASK_BITS (DMZ_BLOCK_SIZE_BITS - 1)
+
+/* Block size in 512B sectors */
+#define DMZ_BLOCK_SECTORS_SHIFT (DMZ_BLOCK_SHIFT - SECTOR_SHIFT)
+#define DMZ_BLOCK_SECTORS (DMZ_BLOCK_SIZE >> SECTOR_SHIFT)
+#define DMZ_BLOCK_SECTORS_MASK (DMZ_BLOCK_SECTORS - 1)
+
+/*
+ * 4KB block <-> 512B sector conversion.
+ * dmz_sect2blk() rounds down: sectors beyond the last full block are
+ * dropped by the right shift.
+ */
+#define dmz_blk2sect(b) ((sector_t)(b) << DMZ_BLOCK_SECTORS_SHIFT)
+#define dmz_sect2blk(s) ((sector_t)(s) >> DMZ_BLOCK_SECTORS_SHIFT)
+
+/* First block and number of blocks of a BIO */
+#define dmz_bio_block(bio) dmz_sect2blk((bio)->bi_iter.bi_sector)
+#define dmz_bio_blocks(bio) dmz_sect2blk(bio_sectors(bio))
+
+/*
+ * Zoned block device information.
+ * Allocated and filled by dmz_get_zoned_device().
+ */
+struct dmz_dev {
+ /* Block device of the dm device obtained with dm_get_device() */
+ struct block_device *bdev;
+
+ /* Device name, as returned by bdevname(), used to prefix messages */
+ char name[BDEVNAME_SIZE];
+
+ /* Device capacity in 512B sectors */
+ sector_t capacity;
+
+ /* Number of zones of the device, rounded up */
+ unsigned int nr_zones;
+
+ /* Zone size in 512B sectors and its log2 */
+ sector_t zone_nr_sectors;
+ unsigned int zone_nr_sectors_shift;
+
+ /*
+ * Zone size in 4KB blocks and its log2.
+ * NOTE(review): the shift is declared sector_t while
+ * zone_nr_sectors_shift is an unsigned int - confirm this is intended.
+ */
+ sector_t zone_nr_blocks;
+ sector_t zone_nr_blocks_shift;
+};
+
+/*
+ * dmz_bio_chunk(): number of the chunk containing the first sector of a BIO.
+ * dmz_chunk_block(): offset of block b within its chunk.
+ * Both use a shift or a mask on the zone size, so they are only exact
+ * when the zone size is a power of two.
+ */
+#define dmz_bio_chunk(dev, bio) ((bio)->bi_iter.bi_sector >> \
+ (dev)->zone_nr_sectors_shift)
+#define dmz_chunk_block(dev, b) ((b) & ((dev)->zone_nr_blocks - 1))
+
+/*
+ * Zone descriptor.
+ * The flags field holds the bits defined by the zone flags enum declared
+ * after this structure and is tested with the dmz_is_xxx() accessors.
+ */
+struct dm_zone {
+ /* For listing the zone depending on its state */
+ struct list_head link;
+
+ /* Zone type and state */
+ unsigned long flags;
+
+ /* Zone activation reference count */
+ atomic_t refcount;
+
+ /* Zone write pointer block (relative to the zone start block) */
+ unsigned int wp_block;
+
+ /* Zone weight (number of valid blocks in the zone) */
+ unsigned int weight;
+
+ /* The chunk that the zone maps */
+ unsigned int chunk;
+
+ /*
+ * For a sequential data zone, pointer to the random zone
+ * used as a buffer for processing unaligned writes.
+ * For a buffer zone, this points back to the data zone.
+ */
+ struct dm_zone *bzone;
+};
+
+/*
+ * Zone flags.
+ * These values are bit numbers (not masks) in the flags field of
+ * struct dm_zone: they are used with test_bit() and set_bit().
+ */
+enum {
+ /* Zone write type */
+ DMZ_RND,
+ DMZ_SEQ,
+
+ /* Zone critical condition */
+ DMZ_OFFLINE,
+ DMZ_READ_ONLY,
+
+ /* How the zone is being used */
+ DMZ_META,
+ DMZ_DATA,
+ DMZ_BUF,
+
+ /* Zone internal state */
+ DMZ_ACTIVE,
+ DMZ_RECLAIM,
+ /* Set by dmz_end_io() when a write to a sequential zone failed */
+ DMZ_SEQ_WRITE_ERR,
+};
+
+/*
+ * Zone data accessors.
+ * Each dmz_is_xxx()/dmz_in_xxx() macro tests one bit of the zone flags.
+ * dmz_is_empty() is the exception: it checks that the zone write pointer
+ * is at the start of the zone.
+ */
+#define dmz_is_rnd(z) test_bit(DMZ_RND, &(z)->flags)
+#define dmz_is_seq(z) test_bit(DMZ_SEQ, &(z)->flags)
+#define dmz_is_empty(z) ((z)->wp_block == 0)
+#define dmz_is_offline(z) test_bit(DMZ_OFFLINE, &(z)->flags)
+#define dmz_is_readonly(z) test_bit(DMZ_READ_ONLY, &(z)->flags)
+#define dmz_is_active(z) test_bit(DMZ_ACTIVE, &(z)->flags)
+#define dmz_in_reclaim(z) test_bit(DMZ_RECLAIM, &(z)->flags)
+#define dmz_seq_write_err(z) test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags)
+
+#define dmz_is_meta(z) test_bit(DMZ_META, &(z)->flags)
+#define dmz_is_buf(z) test_bit(DMZ_BUF, &(z)->flags)
+#define dmz_is_data(z) test_bit(DMZ_DATA, &(z)->flags)
+
+/* Number of valid blocks in the zone */
+#define dmz_weight(z) ((z)->weight)
+
+/*
+ * Message functions.
+ * Wrappers around the device mapper DMINFO/DMERR/DMWARN/DMDEBUG macros
+ * that prefix each message with the device name between parentheses.
+ * dev is a pointer to a struct dmz_dev.
+ */
+#define dmz_dev_info(dev, format, args...) \
+ DMINFO("(%s): " format, (dev)->name, ## args)
+
+#define dmz_dev_err(dev, format, args...) \
+ DMERR("(%s): " format, (dev)->name, ## args)
+
+#define dmz_dev_warn(dev, format, args...) \
+ DMWARN("(%s): " format, (dev)->name, ## args)
+
+#define dmz_dev_debug(dev, format, args...) \
+ DMDEBUG("(%s): " format, (dev)->name, ## args)
+
+struct dmz_metadata;
+struct dmz_reclaim;
+
+/*
+ * Functions defined in dm-zoned-metadata.c
+ */
+int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **zmd);
+void dmz_dtr_metadata(struct dmz_metadata *zmd);
+int dmz_resume_metadata(struct dmz_metadata *zmd);
+
+void dmz_lock_map(struct dmz_metadata *zmd);
+void dmz_unlock_map(struct dmz_metadata *zmd);
+void dmz_lock_metadata(struct dmz_metadata *zmd);
+void dmz_unlock_metadata(struct dmz_metadata *zmd);
+void dmz_lock_flush(struct dmz_metadata *zmd);
+void dmz_unlock_flush(struct dmz_metadata *zmd);
+int dmz_flush_metadata(struct dmz_metadata *zmd);
+
+unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone);
+sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone);
+sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone);
+unsigned int dmz_nr_chunks(struct dmz_metadata *zmd);
+
+#define DMZ_ALLOC_RND 0x01
+#define DMZ_ALLOC_RECLAIM 0x02
+
+struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags);
+void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone);
+
+void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *zone,
+ unsigned int chunk);
+void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone);
+unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd);
+unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd);
+
+void dmz_activate_zone(struct dm_zone *zone);
+void dmz_deactivate_zone(struct dm_zone *zone);
+
+int dmz_lock_zone_reclaim(struct dm_zone *zone);
+void dmz_unlock_zone_reclaim(struct dm_zone *zone);
+struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd);
+
+struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd,
+ unsigned int chunk, int op);
+void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *zone);
+struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
+ struct dm_zone *dzone);
+
+int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
+ sector_t chunk_block, unsigned int nr_blocks);
+int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
+ sector_t chunk_block, unsigned int nr_blocks);
+int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone,
+ sector_t chunk_block);
+int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
+ sector_t *chunk_block);
+int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
+ struct dm_zone *to_zone);
+int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
+ struct dm_zone *to_zone, sector_t chunk_block);
+
+/*
+ * Functions defined in dm-zoned-reclaim.c
+ */
+int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
+ struct dmz_reclaim **zrc);
+void dmz_dtr_reclaim(struct dmz_reclaim *zrc);
+void dmz_suspend_reclaim(struct dmz_reclaim *zrc);
+void dmz_resume_reclaim(struct dmz_reclaim *zrc);
+void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc);
+void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
+
+#endif /* DM_ZONED_H */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3086da5664f3..2edbcc2d7d3f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -12,11 +12,14 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/sched/signal.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
+#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/idr.h>
+#include <linux/uio.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
@@ -56,12 +59,15 @@ static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
static struct workqueue_struct *deferred_remove_workqueue;
+atomic_t dm_global_event_nr = ATOMIC_INIT(0);
+DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
+
/*
* One of these is allocated per bio.
*/
struct dm_io {
struct mapped_device *md;
- int error;
+ blk_status_t status;
atomic_t io_count;
struct bio *bio;
unsigned long start_time;
@@ -91,7 +97,6 @@ static int dm_numa_node = DM_NUMA_NODE;
*/
struct dm_md_mempools {
mempool_t *io_pool;
- mempool_t *rq_pool;
struct bio_set *bs;
};
@@ -466,13 +471,16 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
if (r > 0) {
/*
- * Target determined this ioctl is being issued against
- * a logical partition of the parent bdev; so extra
- * validation is needed.
+ * Target determined this ioctl is being issued against a
+ * subset of the parent bdev; require extra privileges.
*/
- r = scsi_verify_blk_ioctl(NULL, cmd);
- if (r)
+ if (!capable(CAP_SYS_RAWIO)) {
+ DMWARN_LIMIT(
+ "%s: sending ioctl %x to DM device without required privilege.",
+ current->comm, cmd);
+ r = -ENOIOCTLCMD;
goto out;
+ }
}
r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
@@ -626,6 +634,7 @@ static int open_table_device(struct table_device *td, dev_t dev,
}
td->dm_dev.bdev = bdev;
+ td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
return 0;
}
@@ -639,7 +648,9 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
+ put_dax(td->dm_dev.dax_dev);
td->dm_dev.bdev = NULL;
+ td->dm_dev.dax_dev = NULL;
}
static struct table_device *find_table_device(struct list_head *l, dev_t dev,
@@ -761,23 +772,24 @@ static int __noflush_suspending(struct mapped_device *md)
* Decrements the number of outstanding ios that a bio has been
* cloned into, completing the original io if necc.
*/
-static void dec_pending(struct dm_io *io, int error)
+static void dec_pending(struct dm_io *io, blk_status_t error)
{
unsigned long flags;
- int io_error;
+ blk_status_t io_error;
struct bio *bio;
struct mapped_device *md = io->md;
/* Push-back supersedes any I/O errors */
if (unlikely(error)) {
spin_lock_irqsave(&io->endio_lock, flags);
- if (!(io->error > 0 && __noflush_suspending(md)))
- io->error = error;
+ if (!(io->status == BLK_STS_DM_REQUEUE &&
+ __noflush_suspending(md)))
+ io->status = error;
spin_unlock_irqrestore(&io->endio_lock, flags);
}
if (atomic_dec_and_test(&io->io_count)) {
- if (io->error == DM_ENDIO_REQUEUE) {
+ if (io->status == BLK_STS_DM_REQUEUE) {
/*
* Target requested pushing back the I/O.
*/
@@ -786,16 +798,16 @@ static void dec_pending(struct dm_io *io, int error)
bio_list_add_head(&md->deferred, io->bio);
else
/* noflush suspend was interrupted. */
- io->error = -EIO;
+ io->status = BLK_STS_IOERR;
spin_unlock_irqrestore(&md->deferred_lock, flags);
}
- io_error = io->error;
+ io_error = io->status;
bio = io->bio;
end_io_acct(io);
free_io(md, io);
- if (io_error == DM_ENDIO_REQUEUE)
+ if (io_error == BLK_STS_DM_REQUEUE)
return;
if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
@@ -807,8 +819,7 @@ static void dec_pending(struct dm_io *io, int error)
queue_io(md, bio);
} else {
/* done with normal IO or empty flush */
- trace_block_bio_complete(md->queue, bio, io_error);
- bio->bi_error = io_error;
+ bio->bi_status = io_error;
bio_endio(bio);
}
}
@@ -822,36 +833,48 @@ void disable_write_same(struct mapped_device *md)
limits->max_write_same_sectors = 0;
}
+/*
+ * Clear the WRITE ZEROES limit of a mapped device so that the operation
+ * is reported as not supported.
+ */
+void disable_write_zeroes(struct mapped_device *md)
+{
+	/* device doesn't really support WRITE ZEROES, disable it */
+	dm_get_queue_limits(md)->max_write_zeroes_sectors = 0;
+}
+
static void clone_endio(struct bio *bio)
{
- int error = bio->bi_error;
- int r = error;
+ blk_status_t error = bio->bi_status;
struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
struct dm_io *io = tio->io;
struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
+ if (unlikely(error == BLK_STS_TARGET)) {
+ if (bio_op(bio) == REQ_OP_WRITE_SAME &&
+ !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+ disable_write_same(md);
+ if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+ !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+ disable_write_zeroes(md);
+ }
+
if (endio) {
- r = endio(tio->ti, bio, error);
- if (r < 0 || r == DM_ENDIO_REQUEUE)
- /*
- * error and requeue request are handled
- * in dec_pending().
- */
- error = r;
- else if (r == DM_ENDIO_INCOMPLETE)
+ int r = endio(tio->ti, bio, &error);
+ switch (r) {
+ case DM_ENDIO_REQUEUE:
+ error = BLK_STS_DM_REQUEUE;
+ /*FALLTHRU*/
+ case DM_ENDIO_DONE:
+ break;
+ case DM_ENDIO_INCOMPLETE:
/* The target will handle the io */
return;
- else if (r) {
+ default:
DMWARN("unimplemented target endio return value: %d", r);
BUG();
}
}
- if (unlikely(r == -EREMOTEIO && (bio_op(bio) == REQ_OP_WRITE_SAME) &&
- !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
- disable_write_same(md);
-
free_tio(tio);
dec_pending(io, error);
}
@@ -905,31 +928,91 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
}
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
-static long dm_blk_direct_access(struct block_device *bdev, sector_t sector,
- void **kaddr, pfn_t *pfn, long size)
+static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
+ sector_t sector, int *srcu_idx)
{
- struct mapped_device *md = bdev->bd_disk->private_data;
struct dm_table *map;
struct dm_target *ti;
- int srcu_idx;
- long len, ret = -EIO;
- map = dm_get_live_table(md, &srcu_idx);
+ map = dm_get_live_table(md, srcu_idx);
if (!map)
- goto out;
+ return NULL;
ti = dm_table_find_target(map, sector);
if (!dm_target_is_valid(ti))
- goto out;
+ return NULL;
+
+ return ti;
+}
+
+static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
+ long nr_pages, void **kaddr, pfn_t *pfn)
+{
+ struct mapped_device *md = dax_get_private(dax_dev);
+ sector_t sector = pgoff * PAGE_SECTORS;
+ struct dm_target *ti;
+ long len, ret = -EIO;
+ int srcu_idx;
- len = max_io_len(sector, ti) << SECTOR_SHIFT;
- size = min(len, size);
+ ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+ if (!ti)
+ goto out;
+ if (!ti->type->direct_access)
+ goto out;
+ len = max_io_len(sector, ti) / PAGE_SECTORS;
+ if (len < 1)
+ goto out;
+ nr_pages = min(len, nr_pages);
if (ti->type->direct_access)
- ret = ti->type->direct_access(ti, sector, kaddr, pfn, size);
-out:
+ ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
+
+ out:
+ dm_put_live_table(md, srcu_idx);
+
+ return ret;
+}
+
+/*
+ * Copy bytes from the iterator i to addr on behalf of the live target
+ * mapping page offset pgoff. The target dax_copy_from_iter method is used
+ * when it has one, copy_from_iter() otherwise. Returns the value returned
+ * by the copy, or 0 if no valid live target is found.
+ */
+static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	struct mapped_device *md = dax_get_private(dax_dev);
+	struct dm_target *ti;
+	long copied = 0;
+	int srcu_idx;
+
+	ti = dm_dax_get_live_target(md, pgoff * PAGE_SECTORS, &srcu_idx);
+	if (ti) {
+		if (ti->type->dax_copy_from_iter)
+			copied = ti->type->dax_copy_from_iter(ti, pgoff, addr,
+							      bytes, i);
+		else
+			copied = copy_from_iter(addr, bytes, i);
+	}
+
+	/* The live table reference is dropped even when no target was found */
+	dm_put_live_table(md, srcu_idx);
+
+	return copied;
+}
+
+static void dm_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
+ size_t size)
+{
+ struct mapped_device *md = dax_get_private(dax_dev);
+ sector_t sector = pgoff * PAGE_SECTORS;
+ struct dm_target *ti;
+ int srcu_idx;
+
+ ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+
+ if (!ti)
+ goto out;
+ if (ti->type->dax_flush)
+ ti->type->dax_flush(ti, pgoff, addr, size);
+ out:
dm_put_live_table(md, srcu_idx);
- return min(ret, size);
}
/*
@@ -972,10 +1055,144 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
+/*
+ * The zone descriptors obtained with a zone report indicate
+ * zone positions within the target device. The zone descriptors
+ * must be remapped to match their position within the dm device.
+ * A target may call dm_remap_zone_report after completion of a
+ * REQ_OP_ZONE_REPORT bio to remap the zone descriptors obtained
+ * from the target device mapping to the dm device.
+ *
+ * ti is the target, bio the completed clone and start the sector of the
+ * target device to which ti->begin is mapped. Nothing is done if the bio
+ * failed. Without CONFIG_BLK_DEV_ZONED, the bio status is set to
+ * BLK_STS_NOTSUPP.
+ */
+void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
+{
+#ifdef CONFIG_BLK_DEV_ZONED
+ struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+ /* The report buffer is walked through the original BIO, not the clone */
+ struct bio *report_bio = tio->io->bio;
+ struct blk_zone_report_hdr *hdr = NULL;
+ struct blk_zone *zone;
+ /* Number of zone descriptors remapped, reported in the header */
+ unsigned int nr_rep = 0;
+ unsigned int ofst;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ void *addr;
+
+ if (bio->bi_status)
+ return;
+
+ /*
+ * Remap the start sector of the reported zones. For sequential zones,
+ * also remap the write pointer position.
+ */
+ bio_for_each_segment(bvec, report_bio, iter) {
+ /*
+ * NOTE(review): bvec.bv_offset is not added to the mapped address;
+ * this presumably assumes that the report buffer starts at offset 0
+ * of each page - confirm.
+ */
+ addr = kmap_atomic(bvec.bv_page);
+
+ /* Remember the report header in the first page */
+ if (!hdr) {
+ hdr = addr;
+ ofst = sizeof(struct blk_zone_report_hdr);
+ } else
+ ofst = 0;
+
+ /* Set zones start sector */
+ while (hdr->nr_zones && ofst < bvec.bv_len) {
+ zone = addr + ofst;
+ /* Zones beyond the end of the target are not reported */
+ if (zone->start >= start + ti->len) {
+ hdr->nr_zones = 0;
+ break;
+ }
+ zone->start = zone->start + ti->begin - start;
+ if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
+ if (zone->cond == BLK_ZONE_COND_FULL)
+ zone->wp = zone->start + zone->len;
+ else if (zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->wp = zone->start;
+ else
+ zone->wp = zone->wp + ti->begin - start;
+ }
+ ofst += sizeof(struct blk_zone);
+ /* hdr->nr_zones counts down the descriptors left to process */
+ hdr->nr_zones--;
+ nr_rep++;
+ }
+
+ /* The header page stays mapped until the header is updated below */
+ if (addr != hdr)
+ kunmap_atomic(addr);
+
+ if (!hdr->nr_zones)
+ break;
+ }
+
+ if (hdr) {
+ hdr->nr_zones = nr_rep;
+ kunmap_atomic(hdr);
+ }
+
+ /* Consume the whole report BIO */
+ bio_advance(report_bio, report_bio->bi_iter.bi_size);
+
+#else /* !CONFIG_BLK_DEV_ZONED */
+ bio->bi_status = BLK_STS_NOTSUPP;
+#endif
+}
+EXPORT_SYMBOL_GPL(dm_remap_zone_report);
+
+/*
+ * Flush current->bio_list when the target map method blocks.
+ * This fixes deadlocks in snapshot and possibly in other targets.
+ */
+struct dm_offload {
+ /* Plug started by dm_offload_start() and finished by dm_offload_end() */
+ struct blk_plug plug;
+ /* Plug callback, set to flush_current_bio_list() by dm_offload_start() */
+ struct blk_plug_cb cb;
+};
+
+/*
+ * Plug callback installed by dm_offload_start(): move the BIOs pending in
+ * the two current->bio_list lists to the rescue list of the BIO set they
+ * were allocated from and kick the rescue work of that BIO set.
+ * BIOs without a BIO set, from fs_bio_set or from a BIO set without a
+ * rescue workqueue are left in current->bio_list.
+ * The from_schedule argument is not used.
+ */
+static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct dm_offload *o = container_of(cb, struct dm_offload, cb);
+ struct bio_list list;
+ struct bio *bio;
+ int i;
+
+ /* Reinitialize the list head so that list_del() in dm_offload_end() is valid */
+ INIT_LIST_HEAD(&o->cb.list);
+
+ if (unlikely(!current->bio_list))
+ return;
+
+ for (i = 0; i < 2; i++) {
+ /* Take the pending BIOs and restart from an empty list */
+ list = current->bio_list[i];
+ bio_list_init(&current->bio_list[i]);
+
+ while ((bio = bio_list_pop(&list))) {
+ struct bio_set *bs = bio->bi_pool;
+ /* BIOs that cannot be offloaded are put back, in order */
+ if (unlikely(!bs) || bs == fs_bio_set ||
+ !bs->rescue_workqueue) {
+ bio_list_add(&current->bio_list[i], bio);
+ continue;
+ }
+
+ spin_lock(&bs->rescue_lock);
+ bio_list_add(&bs->rescue_list, bio);
+ queue_work(bs->rescue_workqueue, &bs->rescue_work);
+ spin_unlock(&bs->rescue_lock);
+ }
+ }
+}
+
+/*
+ * Start a plug and register flush_current_bio_list() as one of its
+ * callbacks. Must be paired with dm_offload_end().
+ */
+static void dm_offload_start(struct dm_offload *o)
+{
+	struct blk_plug_cb *cb = &o->cb;
+
+	blk_start_plug(&o->plug);
+	cb->callback = flush_current_bio_list;
+	list_add(&cb->list, &current->plug->cb_list);
+}
+
+/*
+ * Remove the callback registered by dm_offload_start() and finish the plug.
+ */
+static void dm_offload_end(struct dm_offload *o)
+{
+	struct blk_plug_cb *cb = &o->cb;
+
+	list_del(&cb->list);
+	blk_finish_plug(&o->plug);
+}
+
static void __map_bio(struct dm_target_io *tio)
{
int r;
sector_t sector;
+ struct dm_offload o;
struct bio *clone = &tio->clone;
struct dm_target *ti = tio->ti;
@@ -988,19 +1205,29 @@ static void __map_bio(struct dm_target_io *tio)
*/
atomic_inc(&tio->io->io_count);
sector = clone->bi_iter.bi_sector;
+
+ dm_offload_start(&o);
r = ti->type->map(ti, clone);
- if (r == DM_MAPIO_REMAPPED) {
- /* the bio has been remapped so dispatch it */
+ dm_offload_end(&o);
+ switch (r) {
+ case DM_MAPIO_SUBMITTED:
+ break;
+ case DM_MAPIO_REMAPPED:
+ /* the bio has been remapped so dispatch it */
trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
tio->io->bio->bi_bdev->bd_dev, sector);
-
generic_make_request(clone);
- } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
- /* error the io and bail out, or requeue it if needed */
- dec_pending(tio->io, r);
+ break;
+ case DM_MAPIO_KILL:
+ dec_pending(tio->io, BLK_STS_IOERR);
free_tio(tio);
- } else if (r != DM_MAPIO_SUBMITTED) {
+ break;
+ case DM_MAPIO_REQUEUE:
+ dec_pending(tio->io, BLK_STS_DM_REQUEUE);
+ free_tio(tio);
+ break;
+ default:
DMWARN("unimplemented target map return value: %d", r);
BUG();
}
@@ -1031,17 +1258,28 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
__bio_clone_fast(clone, bio);
- if (bio_integrity(bio)) {
- int r = bio_integrity_clone(clone, bio, GFP_NOIO);
+ if (unlikely(bio_integrity(bio) != NULL)) {
+ int r;
+
+ if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
+ !dm_target_passes_integrity(tio->ti->type))) {
+ DMWARN("%s: the target %s doesn't support integrity data.",
+ dm_device_name(tio->io->md),
+ tio->ti->type->name);
+ return -EIO;
+ }
+
+ r = bio_integrity_clone(clone, bio, GFP_NOIO);
if (r < 0)
return r;
}
- bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
+ if (bio_op(bio) != REQ_OP_ZONE_REPORT)
+ bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
clone->bi_iter.bi_size = to_bytes(len);
- if (bio_integrity(bio))
- bio_integrity_trim(clone, 0, len);
+ if (unlikely(bio_integrity(bio) != NULL))
+ bio_integrity_trim(clone);
return 0;
}
@@ -1141,6 +1379,11 @@ static unsigned get_num_write_same_bios(struct dm_target *ti)
return ti->num_write_same_bios;
}
+static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
+{
+ return ti->num_write_zeroes_bios;
+}
+
typedef bool (*is_split_required_fn)(struct dm_target *ti);
static bool is_split_required_for_discard(struct dm_target *ti)
@@ -1195,6 +1438,11 @@ static int __send_write_same(struct clone_info *ci)
return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
}
+static int __send_write_zeroes(struct clone_info *ci)
+{
+ return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
+}
+
/*
* Select the correct strategy for processing a non-flush bio.
*/
@@ -1209,12 +1457,18 @@ static int __split_and_process_non_flush(struct clone_info *ci)
return __send_discard(ci);
else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
return __send_write_same(ci);
+ else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
+ return __send_write_zeroes(ci);
ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
return -EIO;
- len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
+ if (bio_op(bio) == REQ_OP_ZONE_REPORT)
+ len = ci->sector_count;
+ else
+ len = min_t(sector_t, max_io_len(ci->sector, ti),
+ ci->sector_count);
r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
if (r < 0)
@@ -1243,7 +1497,7 @@ static void __split_and_process_bio(struct mapped_device *md,
ci.map = map;
ci.md = md;
ci.io = alloc_io(md);
- ci.io->error = 0;
+ ci.io->status = 0;
atomic_set(&ci.io->io_count, 1);
ci.io->bio = bio;
ci.io->md = md;
@@ -1257,6 +1511,10 @@ static void __split_and_process_bio(struct mapped_device *md,
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
+ } else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
+ ci.bio = bio;
+ ci.sector_count = 0;
+ error = __split_and_process_non_flush(&ci);
} else {
ci.bio = bio;
ci.sector_count = bio_sectors(bio);
@@ -1314,7 +1572,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
* With request-based DM we only need to check the
* top-level queue for congestion.
*/
- r = md->queue->backing_dev_info.wb.state & bdi_bits;
+ r = md->queue->backing_dev_info->wb.state & bdi_bits;
} else {
map = dm_get_live_table_fast(md);
if (map)
@@ -1376,6 +1634,7 @@ static int next_free_minor(int *minor)
}
static const struct block_device_operations dm_blk_dops;
+static const struct dax_operations dm_dax_ops;
static void dm_wq_work(struct work_struct *work);
@@ -1397,7 +1656,7 @@ void dm_init_md_queue(struct mapped_device *md)
* - must do so here (in alloc_dev callchain) before queue is used
*/
md->queue->queuedata = md;
- md->queue->backing_dev_info.congested_data = md;
+ md->queue->backing_dev_info->congested_data = md;
}
void dm_init_normal_md_queue(struct mapped_device *md)
@@ -1408,8 +1667,7 @@ void dm_init_normal_md_queue(struct mapped_device *md)
/*
* Initialize aspects of queue that aren't relevant for blk-mq
*/
- md->queue->backing_dev_info.congested_fn = dm_any_congested;
- blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
+ md->queue->backing_dev_info->congested_fn = dm_any_congested;
}
static void cleanup_mapped_device(struct mapped_device *md)
@@ -1419,10 +1677,15 @@ static void cleanup_mapped_device(struct mapped_device *md)
if (md->kworker_task)
kthread_stop(md->kworker_task);
mempool_destroy(md->io_pool);
- mempool_destroy(md->rq_pool);
if (md->bs)
bioset_free(md->bs);
+ if (md->dax_dev) {
+ kill_dax(md->dax_dev);
+ put_dax(md->dax_dev);
+ md->dax_dev = NULL;
+ }
+
if (md->disk) {
spin_lock(&_minor_lock);
md->disk->private_data = NULL;
@@ -1450,6 +1713,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
static struct mapped_device *alloc_dev(int minor)
{
int r, numa_node_id = dm_get_numa_node();
+ struct dax_device *dax_dev;
struct mapped_device *md;
void *old_md;
@@ -1514,6 +1778,12 @@ static struct mapped_device *alloc_dev(int minor)
md->disk->queue = md->queue;
md->disk->private_data = md;
sprintf(md->disk->disk_name, "dm-%d", minor);
+
+ dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
+ if (!dax_dev)
+ goto bad;
+ md->dax_dev = dax_dev;
+
add_disk(md->disk);
format_dev_t(md->name, MKDEV(_major, minor));
@@ -1527,7 +1797,7 @@ static struct mapped_device *alloc_dev(int minor)
bio_init(&md->flush_bio, NULL, 0);
md->flush_bio.bi_bdev = md->bdev;
- md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+ md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
dm_stats_init(&md->stats);
@@ -1595,12 +1865,10 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
goto out;
}
- BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
+ BUG_ON(!p || md->io_pool || md->bs);
md->io_pool = p->io_pool;
p->io_pool = NULL;
- md->rq_pool = p->rq_pool;
- p->rq_pool = NULL;
md->bs = p->bs;
p->bs = NULL;
@@ -1625,7 +1893,9 @@ static void event_callback(void *context)
dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
atomic_inc(&md->event_nr);
+ atomic_inc(&dm_global_event_nr);
wake_up(&md->eventq);
+ wake_up(&dm_global_eventq);
}
/*
@@ -1633,6 +1903,8 @@ static void event_callback(void *context)
*/
static void __set_size(struct mapped_device *md, sector_t size)
{
+ lockdep_assert_held(&md->suspend_lock);
+
set_capacity(md->disk, size);
i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
@@ -1740,13 +2012,13 @@ void dm_unlock_md_type(struct mapped_device *md)
mutex_unlock(&md->type_lock);
}
-void dm_set_md_type(struct mapped_device *md, unsigned type)
+void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
{
BUG_ON(!mutex_is_locked(&md->type_lock));
md->type = type;
}
-unsigned dm_get_md_type(struct mapped_device *md)
+enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
{
return md->type;
}
@@ -1773,11 +2045,11 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits);
int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
{
int r;
- unsigned type = dm_get_md_type(md);
+ enum dm_queue_mode type = dm_get_md_type(md);
switch (type) {
case DM_TYPE_REQUEST_BASED:
- r = dm_old_init_request_queue(md);
+ r = dm_old_init_request_queue(md, t);
if (r) {
DMERR("Cannot initialize queue for request-based mapped device");
return r;
@@ -1804,6 +2076,9 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
if (type == DM_TYPE_DAX_BIO_BASED)
queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
break;
+ case DM_TYPE_NONE:
+ WARN_ON_ONCE(true);
+ break;
}
return 0;
@@ -2082,8 +2357,6 @@ static void unlock_fs(struct mapped_device *md)
* If __dm_suspend returns 0, the device is completely quiescent
* now. There is no request-processing activity. All new requests
* are being added to md->deferred list.
- *
- * Caller must hold md->suspend_lock
*/
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
unsigned suspend_flags, long task_state,
@@ -2101,6 +2374,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
*/
if (noflush)
set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+ else
+ pr_debug("%s: suspending with flush\n", dm_device_name(md));
/*
* This gets reverted if there's an error later and the targets
@@ -2299,6 +2574,8 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla
{
struct dm_table *map = NULL;
+ lockdep_assert_held(&md->suspend_lock);
+
if (md->internal_suspend_count++)
return; /* nested internal suspend */
@@ -2489,11 +2766,10 @@ int dm_noflush_suspending(struct dm_target *ti)
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
unsigned integrity, unsigned per_io_data_size)
{
struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
- struct kmem_cache *cachep = NULL;
unsigned int pool_size = 0;
unsigned int front_pad;
@@ -2503,20 +2779,16 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
switch (type) {
case DM_TYPE_BIO_BASED:
case DM_TYPE_DAX_BIO_BASED:
- cachep = _io_cache;
pool_size = dm_get_reserved_bio_based_ios();
front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
+
+ pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
+ if (!pools->io_pool)
+ goto out;
break;
case DM_TYPE_REQUEST_BASED:
- cachep = _rq_tio_cache;
- pool_size = dm_get_reserved_rq_based_ios();
- pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
- if (!pools->rq_pool)
- goto out;
- /* fall through to setup remaining rq-based pools */
case DM_TYPE_MQ_REQUEST_BASED:
- if (!pool_size)
- pool_size = dm_get_reserved_rq_based_ios();
+ pool_size = dm_get_reserved_rq_based_ios();
front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
/* per_io_data_size is used for blk-mq pdu at queue allocation */
break;
@@ -2524,13 +2796,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
BUG();
}
- if (cachep) {
- pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
- if (!pools->io_pool)
- goto out;
- }
-
- pools->bs = bioset_create_nobvec(pool_size, front_pad);
+ pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER);
if (!pools->bs)
goto out;
@@ -2551,7 +2817,6 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
return;
mempool_destroy(pools->io_pool);
- mempool_destroy(pools->rq_pool);
if (pools->bs)
bioset_free(pools->bs);
@@ -2729,12 +2994,17 @@ static const struct block_device_operations dm_blk_dops = {
.open = dm_blk_open,
.release = dm_blk_close,
.ioctl = dm_blk_ioctl,
- .direct_access = dm_blk_direct_access,
.getgeo = dm_blk_getgeo,
.pr_ops = &dm_pr_ops,
.owner = THIS_MODULE
};
+static const struct dax_operations dm_dax_ops = {
+ .direct_access = dm_dax_direct_access,
+ .copy_from_iter = dm_dax_copy_from_iter,
+ .flush = dm_dax_flush,
+};
+
/*
* module hooks
*/
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index f0aad08b9654..38c84c0a35d4 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -64,7 +64,7 @@ void dm_table_presuspend_undo_targets(struct dm_table *t);
void dm_table_postsuspend_targets(struct dm_table *t);
int dm_table_resume_targets(struct dm_table *t);
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
-unsigned dm_table_get_type(struct dm_table *t);
+enum dm_queue_mode dm_table_get_type(struct dm_table *t);
struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
@@ -76,8 +76,8 @@ struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
void dm_lock_md_type(struct mapped_device *md);
void dm_unlock_md_type(struct mapped_device *md);
-void dm_set_md_type(struct mapped_device *md, unsigned type);
-unsigned dm_get_md_type(struct mapped_device *md);
+void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type);
+enum dm_queue_mode dm_get_md_type(struct mapped_device *md);
struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
@@ -95,8 +95,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
/*
* To check whether the target type is request-based or not (bio-based).
*/
-#define dm_target_request_based(t) (((t)->type->map_rq != NULL) || \
- ((t)->type->clone_and_map_rq != NULL))
+#define dm_target_request_based(t) ((t)->type->clone_and_map_rq != NULL)
/*
* To check whether the target type is a hybrid (capable of being
@@ -205,7 +204,7 @@ void dm_kcopyd_exit(void);
/*
* Mempool operations
*/
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
unsigned integrity, unsigned per_bio_data_size);
void dm_free_md_mempools(struct dm_md_mempools *pools);
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 685aa2d77e25..06a64d5d8c6c 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -170,7 +170,7 @@ static void add_sector(struct faulty_conf *conf, sector_t start, int mode)
conf->nfaults = n+1;
}
-static void faulty_make_request(struct mddev *mddev, struct bio *bio)
+static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
{
struct faulty_conf *conf = mddev->private;
int failit = 0;
@@ -182,7 +182,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio)
* just fail immediately
*/
bio_io_error(bio);
- return;
+ return true;
}
if (check_sector(conf, bio->bi_iter.bi_sector,
@@ -214,7 +214,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio)
}
}
if (failit) {
- struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
b->bi_bdev = conf->rdev->bdev;
b->bi_private = bio;
@@ -224,6 +224,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio)
bio->bi_bdev = conf->rdev->bdev;
generic_make_request(bio);
+ return true;
}
static void faulty_status(struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 5975c9915684..5f1eb9189542 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -53,18 +53,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
return conf->disks + lo;
}
+/*
+ * In linear_congested() conf->raid_disks is used instead of
+ * mddev->raid_disks to iterate conf->disks[]: conf->raid_disks and
+ * conf->disks[] are both created in linear_conf(), so they are always
+ * consistent with each other, whereas mddev->raid_disks may not be.
+ */
static int linear_congested(struct mddev *mddev, int bits)
{
struct linear_conf *conf;
int i, ret = 0;
- conf = mddev->private;
+ rcu_read_lock();
+ conf = rcu_dereference(mddev->private);
- for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+ for (i = 0; i < conf->raid_disks && !ret ; i++) {
struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
- ret |= bdi_congested(&q->backing_dev_info, bits);
+ ret |= bdi_congested(q->backing_dev_info, bits);
}
+ rcu_read_unlock();
return ret;
}
@@ -144,6 +152,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
conf->disks[i-1].end_sector +
conf->disks[i].rdev->sectors;
+ /*
+ * conf->raid_disks is a copy of mddev->raid_disks. The reason to
+ * keep a copy of mddev->raid_disks in struct linear_conf is that
+ * mddev->raid_disks may not match the number of entries in
+ * conf->disks[] when it is updated in linear_add() and used to
+ * iterate the old conf->disks[] array in linear_congested().
+ * Here conf->raid_disks is always consistent with the number of
+ * entries in the conf->disks[] array, and mddev->private is updated
+ * with rcu_assign_pointer() in linear_add(), so such a race can be
+ * avoided.
+ */
+ conf->raid_disks = raid_disks;
+
return conf;
out:
@@ -196,15 +217,24 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
if (!newconf)
return -ENOMEM;
+ /* newconf->raid_disks already keeps a copy of the increased
+ * value of mddev->raid_disks, WARN_ONCE() is just used to make
+ * sure of this. It is possible that oldconf is still referenced
+ * in linear_congested(), therefore kfree_rcu() is used to defer
+ * freeing oldconf until no one uses it anymore.
+ */
mddev_suspend(mddev);
- oldconf = mddev->private;
+ oldconf = rcu_dereference_protected(mddev->private,
+ lockdep_is_held(&mddev->reconfig_mutex));
mddev->raid_disks++;
- mddev->private = newconf;
+ WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
+ "copied raid_disks doesn't match mddev->raid_disks");
+ rcu_assign_pointer(mddev->private, newconf);
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev_resume(mddev);
revalidate_disk(mddev->gendisk);
- kfree(oldconf);
+ kfree_rcu(oldconf, rcu);
return 0;
}
@@ -215,57 +245,54 @@ static void linear_free(struct mddev *mddev, void *priv)
kfree(conf);
}
-static void linear_make_request(struct mddev *mddev, struct bio *bio)
+static bool linear_make_request(struct mddev *mddev, struct bio *bio)
{
char b[BDEVNAME_SIZE];
struct dev_info *tmp_dev;
- struct bio *split;
sector_t start_sector, end_sector, data_offset;
+ sector_t bio_sector = bio->bi_iter.bi_sector;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
- return;
+ return true;
}
- do {
- sector_t bio_sector = bio->bi_iter.bi_sector;
- tmp_dev = which_dev(mddev, bio_sector);
- start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
- end_sector = tmp_dev->end_sector;
- data_offset = tmp_dev->rdev->data_offset;
- bio->bi_bdev = tmp_dev->rdev->bdev;
-
- if (unlikely(bio_sector >= end_sector ||
- bio_sector < start_sector))
- goto out_of_bounds;
-
- if (unlikely(bio_end_sector(bio) > end_sector)) {
- /* This bio crosses a device boundary, so we have to
- * split it.
- */
- split = bio_split(bio, end_sector - bio_sector,
- GFP_NOIO, fs_bio_set);
- bio_chain(split, bio);
- } else {
- split = bio;
- }
+ tmp_dev = which_dev(mddev, bio_sector);
+ start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+ end_sector = tmp_dev->end_sector;
+ data_offset = tmp_dev->rdev->data_offset;
+
+ if (unlikely(bio_sector >= end_sector ||
+ bio_sector < start_sector))
+ goto out_of_bounds;
+
+ if (unlikely(bio_end_sector(bio) > end_sector)) {
+ /* This bio crosses a device boundary, so we have to split it */
+ struct bio *split = bio_split(bio, end_sector - bio_sector,
+ GFP_NOIO, mddev->bio_set);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ }
- split->bi_iter.bi_sector = split->bi_iter.bi_sector -
- start_sector + data_offset;
-
- if (unlikely((bio_op(split) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
- /* Just ignore it */
- bio_endio(split);
- } else {
- if (mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
- split, disk_devt(mddev->gendisk),
- bio_sector);
- generic_make_request(split);
- }
- } while (split != bio);
- return;
+ bio->bi_bdev = tmp_dev->rdev->bdev;
+ bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
+ start_sector + data_offset;
+
+ if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+ /* Just ignore it */
+ bio_endio(bio);
+ } else {
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+ bio, disk_devt(mddev->gendisk),
+ bio_sector);
+ mddev_check_writesame(mddev, bio);
+ mddev_check_write_zeroes(mddev, bio);
+ generic_make_request(bio);
+ }
+ return true;
out_of_bounds:
pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n",
@@ -275,6 +302,7 @@ out_of_bounds:
(unsigned long long)tmp_dev->rdev->sectors,
(unsigned long long)start_sector);
bio_io_error(bio);
+ return true;
}
static void linear_status (struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index b685ddd7d7f7..8d392e6098b3 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -10,6 +10,7 @@ struct linear_conf
{
struct rcu_head rcu;
sector_t array_sectors;
+ int raid_disks; /* a copy of mddev->raid_disks */
struct dev_info disks[0];
};
#endif
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 2b13117fb918..03082e17c65c 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -67,9 +67,10 @@ struct resync_info {
* set up all the related infos such as bitmap and personality */
#define MD_CLUSTER_ALREADY_IN_CLUSTER 6
#define MD_CLUSTER_PENDING_RECV_EVENT 7
-
+#define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8
struct md_cluster_info {
+ struct mddev *mddev; /* the md device which md_cluster_info belongs to */
/* dlm lock space and resources for clustered raid. */
dlm_lockspace_t *lockspace;
int slot_number;
@@ -103,6 +104,7 @@ enum msg_type {
REMOVE,
RE_ADD,
BITMAP_NEEDS_SYNC,
+ CHANGE_CAPACITY,
};
struct cluster_msg {
@@ -523,11 +525,17 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
+ int got_lock = 0;
struct md_cluster_info *cinfo = mddev->cluster_info;
mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
- set_bit(MD_RELOAD_SB, &mddev->flags);
+
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
- md_wakeup_thread(mddev->thread);
+ wait_event(mddev->thread->wqueue,
+ (got_lock = mddev_trylock(mddev)) ||
+ test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
+ md_reload_sb(mddev, mddev->good_device_nr);
+ if (got_lock)
+ mddev_unlock(mddev);
}
static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
@@ -572,6 +580,10 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
case METADATA_UPDATED:
process_metadata_update(mddev, msg);
break;
+ case CHANGE_CAPACITY:
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ break;
case RESYNCING:
process_suspend_info(mddev, le32_to_cpu(msg->slot),
le64_to_cpu(msg->low),
@@ -646,11 +658,29 @@ out:
* Takes the lock on the TOKEN lock resource so no other
* node can communicate while the operation is underway.
*/
-static int lock_token(struct md_cluster_info *cinfo)
+static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
{
- int error;
+ int error, set_bit = 0;
+ struct mddev *mddev = cinfo->mddev;
+ /*
+ * If the resync thread runs after raid1d, process_metadata_update
+ * cannot continue while raid1d holds reconfig_mutex (and raid1d is blocked
+ * since another node already got EX on Token and is waiting for EX on Ack),
+ * so let resync wake up the thread in case the flag is set.
+ */
+ if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+ &cinfo->state)) {
+ error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+ &cinfo->state);
+ WARN_ON_ONCE(error);
+ md_wakeup_thread(mddev->thread);
+ set_bit = 1;
+ }
error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
+ if (set_bit)
+ clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
+
if (error)
pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
__func__, __LINE__, error);
@@ -663,12 +693,12 @@ static int lock_token(struct md_cluster_info *cinfo)
/* lock_comm()
* Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
*/
-static int lock_comm(struct md_cluster_info *cinfo)
+static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
{
wait_event(cinfo->wait,
!test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
- return lock_token(cinfo);
+ return lock_token(cinfo, mddev_locked);
}
static void unlock_comm(struct md_cluster_info *cinfo)
@@ -743,11 +773,12 @@ failed_message:
return error;
}
-static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
+static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg,
+ bool mddev_locked)
{
int ret;
- lock_comm(cinfo);
+ lock_comm(cinfo, mddev_locked);
ret = __sendmsg(cinfo, cmsg);
unlock_comm(cinfo);
return ret;
@@ -777,7 +808,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
bm_lockres->flags |= DLM_LKF_NOQUEUE;
ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
if (ret == -EAGAIN) {
- memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE);
s = read_resync_info(mddev, bm_lockres);
if (s) {
pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
@@ -835,6 +865,7 @@ static int join(struct mddev *mddev, int nodes)
mutex_init(&cinfo->recv_mutex);
mddev->cluster_info = cinfo;
+ cinfo->mddev = mddev;
memset(str, 0, 64);
sprintf(str, "%pU", mddev->uuid);
@@ -909,6 +940,7 @@ static int join(struct mddev *mddev, int nodes)
return 0;
err:
+ set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
md_unregister_thread(&cinfo->recovery_thread);
md_unregister_thread(&cinfo->recv_thread);
lockres_free(cinfo->message_lockres);
@@ -944,7 +976,7 @@ static void resync_bitmap(struct mddev *mddev)
int err;
cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
- err = sendmsg(cinfo, &cmsg);
+ err = sendmsg(cinfo, &cmsg, 1);
if (err)
pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
__func__, __LINE__, err);
@@ -964,6 +996,7 @@ static int leave(struct mddev *mddev)
if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
resync_bitmap(mddev);
+ set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
md_unregister_thread(&cinfo->recovery_thread);
md_unregister_thread(&cinfo->recv_thread);
lockres_free(cinfo->message_lockres);
@@ -974,6 +1007,7 @@ static int leave(struct mddev *mddev)
lockres_free(cinfo->bitmap_lockres);
unlock_all_bitmaps(mddev);
dlm_release_lockspace(cinfo->lockspace, 2);
+ kfree(cinfo);
return 0;
}
@@ -997,16 +1031,30 @@ static int slot_number(struct mddev *mddev)
static int metadata_update_start(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
+ int ret;
+
+ /*
+ * metadata_update_start is always called with the protection of
+ * reconfig_mutex, so set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD here.
+ */
+ ret = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+ &cinfo->state);
+ WARN_ON_ONCE(ret);
+ md_wakeup_thread(mddev->thread);
wait_event(cinfo->wait,
!test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) ||
test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state));
/* If token is already locked, return 0 */
- if (cinfo->token_lockres->mode == DLM_LOCK_EX)
+ if (cinfo->token_lockres->mode == DLM_LOCK_EX) {
+ clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
return 0;
+ }
- return lock_token(cinfo);
+ ret = lock_token(cinfo, 1);
+ clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
+ return ret;
}
static int metadata_update_finish(struct mddev *mddev)
@@ -1043,6 +1091,141 @@ static void metadata_update_cancel(struct mddev *mddev)
unlock_comm(cinfo);
}
+/*
+ * return 0 if all the bitmaps have the same sync_size
+ */
+int cluster_check_sync_size(struct mddev *mddev)
+{
+ int i, rv;
+ bitmap_super_t *sb;
+ unsigned long my_sync_size, sync_size = 0;
+ int node_num = mddev->bitmap_info.nodes;
+ int current_slot = md_cluster_ops->slot_number(mddev);
+ struct bitmap *bitmap = mddev->bitmap;
+ char str[64];
+ struct dlm_lock_resource *bm_lockres;
+
+ sb = kmap_atomic(bitmap->storage.sb_page);
+ my_sync_size = sb->sync_size;
+ kunmap_atomic(sb);
+
+ for (i = 0; i < node_num; i++) {
+ if (i == current_slot)
+ continue;
+
+ bitmap = get_bitmap_from_slot(mddev, i);
+ if (IS_ERR(bitmap)) {
+ pr_err("can't get bitmap from slot %d\n", i);
+ return -1;
+ }
+
+ /*
+ * If we can hold the bitmap lock of one node then
+ * the slot is not occupied, update the sb.
+ */
+ snprintf(str, 64, "bitmap%04d", i);
+ bm_lockres = lockres_init(mddev, str, NULL, 1);
+ if (!bm_lockres) {
+ pr_err("md-cluster: Cannot initialize %s\n", str);
+ bitmap_free(bitmap);
+ return -1;
+ }
+ bm_lockres->flags |= DLM_LKF_NOQUEUE;
+ rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+ if (!rv)
+ bitmap_update_sb(bitmap);
+ lockres_free(bm_lockres);
+
+ sb = kmap_atomic(bitmap->storage.sb_page);
+ if (sync_size == 0)
+ sync_size = sb->sync_size;
+ else if (sync_size != sb->sync_size) {
+ kunmap_atomic(sb);
+ bitmap_free(bitmap);
+ return -1;
+ }
+ kunmap_atomic(sb);
+ bitmap_free(bitmap);
+ }
+
+ return (my_sync_size == sync_size) ? 0 : -1;
+}
+
+/*
+ * Updating the size for cluster raid is a little more complex; we perform
+ * it in these steps:
+ * 1. hold token lock and update superblock in initiator node.
+ * 2. send METADATA_UPDATED msg to other nodes.
+ * 3. The initiator node continues to check each bitmap's sync_size; if all
+ * bitmaps have the same value of sync_size, then we can set capacity and
+ * let the other nodes do the same. If one node can't update sync_size
+ * accordingly, we need to revert to the previous value.
+ */
+static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg;
+ struct md_rdev *rdev;
+ int ret = 0;
+ int raid_slot = -1;
+
+ md_update_sb(mddev, 1);
+ lock_comm(cinfo, 1);
+
+ memset(&cmsg, 0, sizeof(cmsg));
+ cmsg.type = cpu_to_le32(METADATA_UPDATED);
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) {
+ raid_slot = rdev->desc_nr;
+ break;
+ }
+ if (raid_slot >= 0) {
+ cmsg.raid_slot = cpu_to_le32(raid_slot);
+ /*
+ * We can only change capacity after all the nodes can do it,
+ * so we need to wait until the other nodes have received the msg
+ * and handled the change
+ */
+ ret = __sendmsg(cinfo, &cmsg);
+ if (ret) {
+ pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
+ __func__, __LINE__);
+ unlock_comm(cinfo);
+ return;
+ }
+ } else {
+ pr_err("md-cluster: No good device id found to send\n");
+ unlock_comm(cinfo);
+ return;
+ }
+
+ /*
+ * check the sync_size from the other nodes' bitmaps; if sync_size
+ * has already been updated in the other nodes as expected, send a
+ * CHANGE_CAPACITY msg to permit the change of capacity
+ */
+ if (cluster_check_sync_size(mddev) == 0) {
+ memset(&cmsg, 0, sizeof(cmsg));
+ cmsg.type = cpu_to_le32(CHANGE_CAPACITY);
+ ret = __sendmsg(cinfo, &cmsg);
+ if (ret)
+ pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n",
+ __func__, __LINE__);
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ } else {
+ /* revert to previous sectors */
+ ret = mddev->pers->resize(mddev, old_dev_sectors);
+ if (!ret)
+ revalidate_disk(mddev->gendisk);
+ ret = __sendmsg(cinfo, &cmsg);
+ if (ret)
+ pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
+ __func__, __LINE__);
+ }
+ unlock_comm(cinfo);
+}
+
static int resync_start(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -1069,7 +1252,14 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
cmsg.low = cpu_to_le64(lo);
cmsg.high = cpu_to_le64(hi);
- return sendmsg(cinfo, &cmsg);
+ /*
+ * mddev_lock is held if resync_info_update is called from
+ * resync_finish (md_reap_sync_thread -> resync_finish)
+ */
+ if (lo == 0 && hi == 0)
+ return sendmsg(cinfo, &cmsg, 1);
+ else
+ return sendmsg(cinfo, &cmsg, 0);
}
static int resync_finish(struct mddev *mddev)
@@ -1119,10 +1309,12 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
cmsg.type = cpu_to_le32(NEWDISK);
memcpy(cmsg.uuid, uuid, 16);
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
- lock_comm(cinfo);
+ lock_comm(cinfo, 1);
ret = __sendmsg(cinfo, &cmsg);
- if (ret)
+ if (ret) {
+ unlock_comm(cinfo);
return ret;
+ }
cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
@@ -1179,7 +1371,7 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct md_cluster_info *cinfo = mddev->cluster_info;
cmsg.type = cpu_to_le32(REMOVE);
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
- return sendmsg(cinfo, &cmsg);
+ return sendmsg(cinfo, &cmsg, 1);
}
static int lock_all_bitmaps(struct mddev *mddev)
@@ -1243,7 +1435,7 @@ static int gather_bitmaps(struct md_rdev *rdev)
cmsg.type = cpu_to_le32(RE_ADD);
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
- err = sendmsg(cinfo, &cmsg);
+ err = sendmsg(cinfo, &cmsg, 1);
if (err)
goto out;
@@ -1281,6 +1473,7 @@ static struct md_cluster_operations cluster_ops = {
.gather_bitmaps = gather_bitmaps,
.lock_all_bitmaps = lock_all_bitmaps,
.unlock_all_bitmaps = unlock_all_bitmaps,
+ .update_size = update_size,
};
static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index e765499ba591..274016177983 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -27,6 +27,7 @@ struct md_cluster_operations {
int (*gather_bitmaps)(struct md_rdev *rdev);
int (*lock_all_bitmaps)(struct mddev *mddev);
void (*unlock_all_bitmaps)(struct mddev *mddev);
+ void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors);
};
#endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 01175dac0db6..8cdca0296749 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -44,6 +44,7 @@
*/
+#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/badblocks.h>
@@ -64,6 +65,8 @@
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
+#include <linux/percpu-refcount.h>
+
#include <trace/events/block.h>
#include "md.h"
#include "bitmap.h"
@@ -171,8 +174,18 @@ static const struct block_device_operations md_fops;
static int start_readonly;
+/*
+ * The original mechanism for creating an md device is to create
+ * a device node in /dev and to open it. This causes races with device-close.
+ * The preferred method is to write to the "new_array" module parameter.
+ * This can avoid races.
+ * Setting create_on_open to false disables the original mechanism
+ * so all the races disappear.
+ */
+static bool create_on_open = true;
+
/* bio_clone_mddev
- * like bio_clone, but with a local bio set
+ * like bio_clone_bioset, but with a local bio set
*/
struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
@@ -190,15 +203,13 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);
-struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
- struct mddev *mddev)
+static struct bio *md_bio_alloc_sync(struct mddev *mddev)
{
- if (!mddev || !mddev->bio_set)
- return bio_clone(bio, gfp_mask);
+ if (!mddev || !mddev->sync_set)
+ return bio_alloc(GFP_NOIO, 1);
- return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
+ return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set);
}
-EXPORT_SYMBOL_GPL(bio_clone_mddev);
/*
* We have a system wide 'event count' that is incremented
@@ -262,7 +273,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
unsigned int sectors;
int cpu;
- blk_queue_split(q, &bio, q->bio_split);
+ blk_queue_split(q, &bio);
if (mddev == NULL || mddev->pers == NULL) {
bio_io_error(bio);
@@ -270,11 +281,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
}
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
if (bio_sectors(bio) != 0)
- bio->bi_error = -EROFS;
+ bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
return BLK_QC_T_NONE;
}
- smp_rmb(); /* Ensure implications of 'active' are visible */
+check_suspended:
rcu_read_lock();
if (mddev->suspended) {
DEFINE_WAIT(__wait);
@@ -299,7 +310,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
sectors = bio_sectors(bio);
/* bio could be mergeable after passing to underlayer */
bio->bi_opf &= ~REQ_NOMERGE;
- mddev->pers->make_request(mddev, bio);
+ if (!mddev->pers->make_request(mddev, bio)) {
+ atomic_dec(&mddev->active_io);
+ wake_up(&mddev->sb_wait);
+ goto check_suspended;
+ }
cpu = part_stat_lock();
part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
@@ -324,6 +339,7 @@ void mddev_suspend(struct mddev *mddev)
if (mddev->suspended++)
return;
synchronize_rcu();
+ wake_up(&mddev->sb_wait);
wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
mddev->pers->quiesce(mddev, 1);
@@ -449,14 +465,6 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
}
EXPORT_SYMBOL(md_flush_request);
-void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
-{
- struct mddev *mddev = cb->data;
- md_wakeup_thread(mddev->thread);
- kfree(cb);
-}
-EXPORT_SYMBOL(md_unplug);
-
static inline struct mddev *mddev_get(struct mddev *mddev)
{
atomic_inc(&mddev->active);
@@ -467,7 +475,7 @@ static void mddev_delayed_delete(struct work_struct *ws);
static void mddev_put(struct mddev *mddev)
{
- struct bio_set *bs = NULL;
+ struct bio_set *bs = NULL, *sync_bs = NULL;
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
@@ -477,7 +485,9 @@ static void mddev_put(struct mddev *mddev)
* so destroy it */
list_del_init(&mddev->all_mddevs);
bs = mddev->bio_set;
+ sync_bs = mddev->sync_set;
mddev->bio_set = NULL;
+ mddev->sync_set = NULL;
if (mddev->gendisk) {
/* We did a probe so need to clean up. Call
* queue_work inside the spinlock so that
@@ -492,6 +502,8 @@ static void mddev_put(struct mddev *mddev)
spin_unlock(&all_mddevs_lock);
if (bs)
bioset_free(bs);
+ if (sync_bs)
+ bioset_free(sync_bs);
}
static void md_safemode_timeout(unsigned long data);
@@ -724,8 +736,8 @@ static void super_written(struct bio *bio)
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;
- if (bio->bi_error) {
- pr_err("md: super_written gets error=%d\n", bio->bi_error);
+ if (bio->bi_status) {
+ pr_err("md: super_written gets error=%d\n", bio->bi_status);
md_error(mddev, rdev);
if (!test_bit(Faulty, &rdev->flags)
&& (bio->bi_opf & MD_FAILFAST)) {
@@ -756,7 +768,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
if (test_bit(Faulty, &rdev->flags))
return;
- bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
+ bio = md_bio_alloc_sync(mddev);
atomic_inc(&rdev->nr_pending);
@@ -770,7 +782,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
test_bit(FailFast, &rdev->flags) &&
!test_bit(LastDev, &rdev->flags))
ff = MD_FAILFAST;
- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA | ff;
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;
atomic_inc(&mddev->pending_writes);
submit_bio(bio);
@@ -788,7 +800,7 @@ int md_super_wait(struct mddev *mddev)
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
struct page *page, int op, int op_flags, bool metadata_op)
{
- struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
+ struct bio *bio = md_bio_alloc_sync(rdev->mddev);
int ret;
bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
@@ -806,7 +818,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
submit_bio_wait(bio);
- ret = !bio->bi_error;
+ ret = !bio->bi_status;
bio_put(bio);
return ret;
}
@@ -830,7 +842,7 @@ fail:
return -EINVAL;
}
-static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
return sb1->set_uuid0 == sb2->set_uuid0 &&
sb1->set_uuid1 == sb2->set_uuid1 &&
@@ -838,7 +850,7 @@ static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
sb1->set_uuid3 == sb2->set_uuid3;
}
-static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
int ret;
mdp_super_t *tmp1, *tmp2;
@@ -1030,12 +1042,12 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
} else {
__u64 ev1, ev2;
mdp_super_t *refsb = page_address(refdev->sb_page);
- if (!uuid_equal(refsb, sb)) {
+ if (!md_uuid_equal(refsb, sb)) {
pr_warn("md: %s has different UUID to %s\n",
b, bdevname(refdev->bdev,b2));
goto abort;
}
- if (!sb_equal(refsb, sb)) {
+ if (!md_sb_equal(refsb, sb)) {
pr_warn("md: %s has same UUID but different superblock to %s\n",
b, bdevname(refdev->bdev, b2));
goto abort;
@@ -1524,6 +1536,12 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
} else if (sb->bblog_offset != 0)
rdev->badblocks.shift = 0;
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
+ rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
+ rdev->ppl.size = le16_to_cpu(sb->ppl.size);
+ rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
+ }
+
if (!refdev) {
ret = 1;
} else {
@@ -1636,6 +1654,13 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
+
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
+ if (le32_to_cpu(sb->feature_map) &
+ (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
+ return -EINVAL;
+ set_bit(MD_HAS_PPL, &mddev->flags);
+ }
} else if (mddev->pers == NULL) {
/* Insist of good event counter while assembling, except for
* spares (which don't need an event count) */
@@ -1844,11 +1869,17 @@ retry:
max_dev = le32_to_cpu(sb->max_dev);
for (i=0; i<max_dev;i++)
- sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
+ if (test_bit(MD_HAS_PPL, &mddev->flags)) {
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
+ sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
+ sb->ppl.size = cpu_to_le16(rdev->ppl.size);
+ }
+
rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
@@ -1896,7 +1927,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
}
sb = page_address(rdev->sb_page);
sb->data_size = cpu_to_le64(num_sectors);
- sb->super_offset = rdev->sb_start;
+ sb->super_offset = cpu_to_le64(rdev->sb_start);
sb->sb_csum = calc_sb_1_csum(sb);
do {
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
@@ -2089,6 +2120,10 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
if (find_rdev(mddev, rdev->bdev->bd_dev))
return -EEXIST;
+ if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
+ mddev->pers)
+ return -EROFS;
+
/* make sure rdev->sectors exceeds mddev->dev_sectors */
if (!test_bit(Journal, &rdev->flags) &&
rdev->sectors &&
@@ -2250,6 +2285,33 @@ static void export_array(struct mddev *mddev)
mddev->major_version = 0;
}
+static bool set_in_sync(struct mddev *mddev)
+{
+ WARN_ON_ONCE(!spin_is_locked(&mddev->lock));
+ if (!mddev->in_sync) {
+ mddev->sync_checkers++;
+ spin_unlock(&mddev->lock);
+ percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
+ spin_lock(&mddev->lock);
+ if (!mddev->in_sync &&
+ percpu_ref_is_zero(&mddev->writes_pending)) {
+ mddev->in_sync = 1;
+ /*
+ * Ensure ->in_sync is visible before we clear
+ * ->sync_checkers.
+ */
+ smp_mb();
+ set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ }
+ if (--mddev->sync_checkers == 0)
+ percpu_ref_switch_to_percpu(&mddev->writes_pending);
+ }
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+ return mddev->in_sync;
+}
+
static void sync_sbs(struct mddev *mddev, int nospares)
{
/* Update each superblock (in-memory image), but
@@ -2304,7 +2366,7 @@ static bool does_sb_need_changing(struct mddev *mddev)
/* Check if any mddev parameters have changed */
if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
(mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
- (mddev->layout != le64_to_cpu(sb->layout)) ||
+ (mddev->layout != le32_to_cpu(sb->layout)) ||
(mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
(mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
return true;
@@ -3148,6 +3210,78 @@ static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
static struct rdev_sysfs_entry rdev_unack_bad_blocks =
__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
+static ssize_t
+ppl_sector_show(struct md_rdev *rdev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
+}
+
+static ssize_t
+ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+ unsigned long long sector;
+
+ if (kstrtoull(buf, 10, &sector) < 0)
+ return -EINVAL;
+ if (sector != (sector_t)sector)
+ return -EINVAL;
+
+ if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
+ rdev->raid_disk >= 0)
+ return -EBUSY;
+
+ if (rdev->mddev->persistent) {
+ if (rdev->mddev->major_version == 0)
+ return -EINVAL;
+ if ((sector > rdev->sb_start &&
+ sector - rdev->sb_start > S16_MAX) ||
+ (sector < rdev->sb_start &&
+ rdev->sb_start - sector > -S16_MIN))
+ return -EINVAL;
+ rdev->ppl.offset = sector - rdev->sb_start;
+ } else if (!rdev->mddev->external) {
+ return -EBUSY;
+ }
+ rdev->ppl.sector = sector;
+ return len;
+}
+
+static struct rdev_sysfs_entry rdev_ppl_sector =
+__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
+
+static ssize_t
+ppl_size_show(struct md_rdev *rdev, char *page)
+{
+ return sprintf(page, "%u\n", rdev->ppl.size);
+}
+
+static ssize_t
+ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+ unsigned int size;
+
+ if (kstrtouint(buf, 10, &size) < 0)
+ return -EINVAL;
+
+ if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
+ rdev->raid_disk >= 0)
+ return -EBUSY;
+
+ if (rdev->mddev->persistent) {
+ if (rdev->mddev->major_version == 0)
+ return -EINVAL;
+ if (size > U16_MAX)
+ return -EINVAL;
+ } else if (!rdev->mddev->external) {
+ return -EBUSY;
+ }
+ rdev->ppl.size = size;
+ return len;
+}
+
+static struct rdev_sysfs_entry rdev_ppl_size =
+__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
+
static struct attribute *rdev_default_attrs[] = {
&rdev_state.attr,
&rdev_errors.attr,
@@ -3158,6 +3292,8 @@ static struct attribute *rdev_default_attrs[] = {
&rdev_recovery_start.attr,
&rdev_bad_blocks.attr,
&rdev_unack_bad_blocks.attr,
+ &rdev_ppl_sector.attr,
+ &rdev_ppl_size.attr,
NULL,
};
static ssize_t
@@ -3920,6 +4056,7 @@ array_state_show(struct mddev *mddev, char *page)
st = read_auto;
break;
case 0:
+ spin_lock(&mddev->lock);
if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
st = write_pending;
else if (mddev->in_sync)
@@ -3928,6 +4065,7 @@ array_state_show(struct mddev *mddev, char *page)
st = active_idle;
else
st = active;
+ spin_unlock(&mddev->lock);
}
else {
if (list_empty(&mddev->disks) &&
@@ -3948,7 +4086,7 @@ static int restart_array(struct mddev *mddev);
static ssize_t
array_state_store(struct mddev *mddev, const char *buf, size_t len)
{
- int err;
+ int err = 0;
enum array_state st = match_word(buf, array_states);
if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
@@ -3961,18 +4099,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
wake_up(&mddev->sb_wait);
- err = 0;
} else /* st == clean */ {
restart_array(mddev);
- if (atomic_read(&mddev->writes_pending) == 0) {
- if (mddev->in_sync == 0) {
- mddev->in_sync = 1;
- if (mddev->safemode == 1)
- mddev->safemode = 0;
- set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
- }
- err = 0;
- } else
+ if (!set_in_sync(mddev))
err = -EBUSY;
}
if (!err)
@@ -4030,15 +4159,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
if (err)
break;
spin_lock(&mddev->lock);
- if (atomic_read(&mddev->writes_pending) == 0) {
- if (mddev->in_sync == 0) {
- mddev->in_sync = 1;
- if (mddev->safemode == 1)
- mddev->safemode = 0;
- set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
- }
- err = 0;
- } else
+ if (!set_in_sync(mddev))
err = -EBUSY;
spin_unlock(&mddev->lock);
} else
@@ -4860,8 +4981,10 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len)
return err;
/* cluster raid doesn't support change array_sectors */
- if (mddev_is_clustered(mddev))
+ if (mddev_is_clustered(mddev)) {
+ mddev_unlock(mddev);
return -EINVAL;
+ }
if (strncmp(buf, "default", 7) == 0) {
if (mddev->pers)
@@ -4894,6 +5017,52 @@ static struct md_sysfs_entry md_array_size =
__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
array_size_store);
+static ssize_t
+consistency_policy_show(struct mddev *mddev, char *page)
+{
+ int ret;
+
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+ ret = sprintf(page, "journal\n");
+ } else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
+ ret = sprintf(page, "ppl\n");
+ } else if (mddev->bitmap) {
+ ret = sprintf(page, "bitmap\n");
+ } else if (mddev->pers) {
+ if (mddev->pers->sync_request)
+ ret = sprintf(page, "resync\n");
+ else
+ ret = sprintf(page, "none\n");
+ } else {
+ ret = sprintf(page, "unknown\n");
+ }
+
+ return ret;
+}
+
+static ssize_t
+consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int err = 0;
+
+ if (mddev->pers) {
+ if (mddev->pers->change_consistency_policy)
+ err = mddev->pers->change_consistency_policy(mddev, buf);
+ else
+ err = -EBUSY;
+ } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
+ set_bit(MD_HAS_PPL, &mddev->flags);
+ } else {
+ err = -EINVAL;
+ }
+
+ return err ? err : len;
+}
+
+static struct md_sysfs_entry md_consistency_policy =
+__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
+ consistency_policy_store);
+
static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_layout.attr,
@@ -4909,6 +5078,7 @@ static struct attribute *md_default_attrs[] = {
&md_reshape_direction.attr,
&md_array_size.attr,
&max_corr_read_errors.attr,
+ &md_consistency_policy.attr,
NULL,
};
@@ -4993,6 +5163,7 @@ static void md_free(struct kobject *ko)
del_gendisk(mddev->gendisk);
put_disk(mddev->gendisk);
}
+ percpu_ref_exit(&mddev->writes_pending);
kfree(mddev);
}
@@ -5018,8 +5189,31 @@ static void mddev_delayed_delete(struct work_struct *ws)
kobject_put(&mddev->kobj);
}
+static void no_op(struct percpu_ref *r) {}
+
+int mddev_init_writes_pending(struct mddev *mddev)
+{
+ if (mddev->writes_pending.percpu_count_ptr)
+ return 0;
+ if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
+ return -ENOMEM;
+ /* We want to start with the refcount at zero */
+ percpu_ref_put(&mddev->writes_pending);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
+
static int md_alloc(dev_t dev, char *name)
{
+ /*
+ * If dev is zero, name is the name of a device to allocate with
+ * an arbitrary minor number. It will be "md_???"
+ * If dev is non-zero it must be a device number with a MAJOR of
+ * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then
+ * the device is being created by opening a node in /dev.
+ * If "name" is not NULL, the device is being created by
+ * writing to /sys/module/md_mod/parameters/new_array.
+ */
static DEFINE_MUTEX(disks_mutex);
struct mddev *mddev = mddev_find(dev);
struct gendisk *disk;
@@ -5045,7 +5239,7 @@ static int md_alloc(dev_t dev, char *name)
if (mddev->gendisk)
goto abort;
- if (name) {
+ if (name && !dev) {
/* Need to ensure that 'name' is not a duplicate.
*/
struct mddev *mddev2;
@@ -5059,6 +5253,11 @@ static int md_alloc(dev_t dev, char *name)
}
spin_unlock(&all_mddevs_lock);
}
+ if (name && dev)
+ /*
+ * Creating /dev/mdNNN via "new_array", so adjust hold_active.
+ */
+ mddev->hold_active = UNTIL_STOP;
error = -ENOMEM;
mddev->queue = blk_alloc_queue(GFP_KERNEL);
@@ -5125,38 +5324,48 @@ static int md_alloc(dev_t dev, char *name)
static struct kobject *md_probe(dev_t dev, int *part, void *data)
{
- md_alloc(dev, NULL);
+ if (create_on_open)
+ md_alloc(dev, NULL);
return NULL;
}
static int add_named_array(const char *val, struct kernel_param *kp)
{
- /* val must be "md_*" where * is not all digits.
- * We allocate an array with a large free minor number, and
+ /*
+ * val must be "md_*" or "mdNNN".
+ * For "md_*" we allocate an array with a large free minor number, and
* set the name to val. val must not already be an active name.
+ * For "mdNNN" we allocate an array with the minor number NNN
+ * which must not already be in use.
*/
int len = strlen(val);
char buf[DISK_NAME_LEN];
+ unsigned long devnum;
while (len && val[len-1] == '\n')
len--;
if (len >= DISK_NAME_LEN)
return -E2BIG;
strlcpy(buf, val, len+1);
- if (strncmp(buf, "md_", 3) != 0)
- return -EINVAL;
- return md_alloc(0, buf);
+ if (strncmp(buf, "md_", 3) == 0)
+ return md_alloc(0, buf);
+ if (strncmp(buf, "md", 2) == 0 &&
+ isdigit(buf[2]) &&
+ kstrtoul(buf+2, 10, &devnum) == 0 &&
+ devnum <= MINORMASK)
+ return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);
+
+ return -EINVAL;
}
static void md_safemode_timeout(unsigned long data)
{
struct mddev *mddev = (struct mddev *) data;
- if (!atomic_read(&mddev->writes_pending)) {
- mddev->safemode = 1;
- if (mddev->external)
- sysfs_notify_dirent_safe(mddev->sysfs_state);
- }
+ mddev->safemode = 1;
+ if (mddev->external)
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+
md_wakeup_thread(mddev->thread);
}
@@ -5202,6 +5411,13 @@ int md_run(struct mddev *mddev)
continue;
sync_blockdev(rdev->bdev);
invalidate_bdev(rdev->bdev);
+ if (mddev->ro != 1 &&
+ (bdev_read_only(rdev->bdev) ||
+ bdev_read_only(rdev->meta_bdev))) {
+ mddev->ro = 1;
+ if (mddev->gendisk)
+ set_disk_ro(mddev->gendisk, 1);
+ }
/* perform some consistency tests on the device.
* We don't want the data to overlap the metadata,
@@ -5228,8 +5444,16 @@ int md_run(struct mddev *mddev)
sysfs_notify_dirent_safe(rdev->sysfs_state);
}
- if (mddev->bio_set == NULL)
- mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
+ if (mddev->bio_set == NULL) {
+ mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (!mddev->bio_set)
+ return -ENOMEM;
+ }
+ if (mddev->sync_set == NULL) {
+ mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (!mddev->sync_set)
+ return -ENOMEM;
+ }
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
@@ -5346,8 +5570,8 @@ int md_run(struct mddev *mddev)
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
else
queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
- mddev->queue->backing_dev_info.congested_data = mddev;
- mddev->queue->backing_dev_info.congested_fn = md_congested;
+ mddev->queue->backing_dev_info->congested_data = mddev;
+ mddev->queue->backing_dev_info->congested_fn = md_congested;
}
if (pers->sync_request) {
if (mddev->kobj.sd &&
@@ -5358,7 +5582,6 @@ int md_run(struct mddev *mddev)
} else if (mddev->ro == 2) /* auto-readonly not meaningful */
mddev->ro = 0;
- atomic_set(&mddev->writes_pending,0);
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0;
@@ -5424,6 +5647,9 @@ out:
static int restart_array(struct mddev *mddev)
{
struct gendisk *disk = mddev->gendisk;
+ struct md_rdev *rdev;
+ bool has_journal = false;
+ bool has_readonly = false;
/* Complain if it has no devices */
if (list_empty(&mddev->disks))
@@ -5432,24 +5658,21 @@ static int restart_array(struct mddev *mddev)
return -EINVAL;
if (!mddev->ro)
return -EBUSY;
- if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
- struct md_rdev *rdev;
- bool has_journal = false;
-
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev) {
- if (test_bit(Journal, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags)) {
- has_journal = true;
- break;
- }
- }
- rcu_read_unlock();
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (test_bit(Journal, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags))
+ has_journal = true;
+ if (bdev_read_only(rdev->bdev))
+ has_readonly = true;
+ }
+ rcu_read_unlock();
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
/* Don't restart rw with journal missing/faulty */
- if (!has_journal)
return -EINVAL;
- }
+ if (has_readonly)
+ return -EROFS;
mddev->safemode = 0;
mddev->ro = 0;
@@ -5549,15 +5772,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
static void mddev_detach(struct mddev *mddev)
{
- struct bitmap *bitmap = mddev->bitmap;
- /* wait for behind writes to complete */
- if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
- pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
- mdname(mddev));
- /* need to kick something here to make sure I/O goes? */
- wait_event(bitmap->behind_wait,
- atomic_read(&bitmap->behind_writes) == 0);
- }
+ bitmap_wait_behind_writes(mddev);
if (mddev->pers && mddev->pers->quiesce) {
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
@@ -5570,6 +5785,7 @@ static void mddev_detach(struct mddev *mddev)
static void __md_stop(struct mddev *mddev)
{
struct md_personality *pers = mddev->pers;
+ bitmap_destroy(mddev);
mddev_detach(mddev);
/* Ensure ->event_work is done */
flush_workqueue(md_misc_wq);
@@ -5590,7 +5806,6 @@ void md_stop(struct mddev *mddev)
* This is called from dm-raid
*/
__md_stop(mddev);
- bitmap_destroy(mddev);
if (mddev->bio_set)
bioset_free(mddev->bio_set);
}
@@ -5704,7 +5919,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
__md_stop_writes(mddev);
__md_stop(mddev);
- mddev->queue->backing_dev_info.congested_fn = NULL;
+ mddev->queue->backing_dev_info->congested_fn = NULL;
/* tell userspace to handle 'inactive' */
sysfs_notify_dirent_safe(mddev->sysfs_state);
@@ -5728,7 +5943,6 @@ static int do_md_stop(struct mddev *mddev, int mode,
if (mode == 0) {
pr_info("md: %s stopped.\n", mdname(mddev));
- bitmap_destroy(mddev);
if (mddev->bitmap_info.file) {
struct file *f = mddev->bitmap_info.file;
spin_lock(&mddev->lock);
@@ -6464,11 +6678,10 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->layout = info->layout;
mddev->chunk_sectors = info->chunk_size >> 9;
- mddev->max_disks = MD_SB_DISKS;
-
if (mddev->persistent) {
- mddev->flags = 0;
- mddev->sb_flags = 0;
+ mddev->max_disks = MD_SB_DISKS;
+ mddev->flags = 0;
+ mddev->sb_flags = 0;
}
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
@@ -6508,10 +6721,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
struct md_rdev *rdev;
int rv;
int fit = (num_sectors == 0);
-
- /* cluster raid doesn't support update size */
- if (mddev_is_clustered(mddev))
- return -EINVAL;
+ sector_t old_dev_sectors = mddev->dev_sectors;
if (mddev->pers->resize == NULL)
return -EINVAL;
@@ -6539,8 +6749,14 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
return -ENOSPC;
}
rv = mddev->pers->resize(mddev, num_sectors);
- if (!rv)
- revalidate_disk(mddev->gendisk);
+ if (!rv) {
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->update_size(mddev, old_dev_sectors);
+ else if (mddev->queue) {
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ }
+ }
return rv;
}
@@ -6787,6 +7003,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
void __user *argp = (void __user *)arg;
struct mddev *mddev = NULL;
int ro;
+ bool did_set_md_closing = false;
if (!md_ioctl_valid(cmd))
return -ENOTTY;
@@ -6876,7 +7093,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
err = -EBUSY;
goto out;
}
+ WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
set_bit(MD_CLOSING, &mddev->flags);
+ did_set_md_closing = true;
mutex_unlock(&mddev->open_mutex);
sync_blockdev(bdev);
}
@@ -7069,6 +7288,8 @@ unlock:
mddev->hold_active = 0;
mddev_unlock(mddev);
out:
+ if(did_set_md_closing)
+ clear_bit(MD_CLOSING, &mddev->flags);
return err;
}
#ifdef CONFIG_COMPAT
@@ -7219,8 +7440,8 @@ void md_wakeup_thread(struct md_thread *thread)
{
if (thread) {
pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
- set_bit(THREAD_WAKEUP, &thread->flags);
- wake_up(&thread->wqueue);
+ if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags))
+ wake_up(&thread->wqueue);
}
}
EXPORT_SYMBOL(md_wakeup_thread);
@@ -7751,12 +7972,14 @@ EXPORT_SYMBOL(md_done_sync);
* If we need to update some array metadata (e.g. 'active' flag
* in superblock) before writing, schedule a superblock update
* and wait for it to complete.
+ * A return value of 'false' means that the write wasn't recorded
+ * and cannot proceed as the array is being suspended.
*/
-void md_write_start(struct mddev *mddev, struct bio *bi)
+bool md_write_start(struct mddev *mddev, struct bio *bi)
{
int did_change = 0;
if (bio_data_dir(bi) != WRITE)
- return;
+ return true;
BUG_ON(mddev->ro == 1);
if (mddev->ro == 2) {
@@ -7767,10 +7990,13 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
md_wakeup_thread(mddev->sync_thread);
did_change = 1;
}
- atomic_inc(&mddev->writes_pending);
+ rcu_read_lock();
+ percpu_ref_get(&mddev->writes_pending);
+ smp_mb(); /* Match smp_mb in set_in_sync() */
if (mddev->safemode == 1)
mddev->safemode = 0;
- if (mddev->in_sync) {
+ /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
+ if (mddev->in_sync || !mddev->sync_checkers) {
spin_lock(&mddev->lock);
if (mddev->in_sync) {
mddev->in_sync = 0;
@@ -7781,22 +8007,51 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
}
spin_unlock(&mddev->lock);
}
+ rcu_read_unlock();
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
wait_event(mddev->sb_wait,
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended);
+ if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
+ percpu_ref_put(&mddev->writes_pending);
+ return false;
+ }
+ return true;
}
EXPORT_SYMBOL(md_write_start);
+/* md_write_inc can only be called when md_write_start() has
+ * already been called at least once for the current request.
+ * It increments the counter and is useful when a single request
+ * is split into several parts. Each part causes an increment and
+ * so needs a matching md_write_end().
+ * Unlike md_write_start(), it is safe to call md_write_inc() inside
+ * a spinlocked region.
+ */
+void md_write_inc(struct mddev *mddev, struct bio *bi)
+{
+ if (bio_data_dir(bi) != WRITE)
+ return;
+ WARN_ON_ONCE(mddev->in_sync || mddev->ro);
+ percpu_ref_get(&mddev->writes_pending);
+}
+EXPORT_SYMBOL(md_write_inc);
+
void md_write_end(struct mddev *mddev)
{
- if (atomic_dec_and_test(&mddev->writes_pending)) {
- if (mddev->safemode == 2)
- md_wakeup_thread(mddev->thread);
- else if (mddev->safemode_delay)
- mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
- }
+ percpu_ref_put(&mddev->writes_pending);
+
+ if (mddev->safemode == 2)
+ md_wakeup_thread(mddev->thread);
+ else if (mddev->safemode_delay)
+ /* The roundup() ensures this only performs locking once
+ * every ->safemode_delay jiffies
+ */
+ mod_timer(&mddev->safemode_timer,
+ roundup(jiffies, mddev->safemode_delay) +
+ mddev->safemode_delay);
}
+
EXPORT_SYMBOL(md_write_end);
/* md_allow_write(mddev)
@@ -7804,18 +8059,15 @@ EXPORT_SYMBOL(md_write_end);
* may proceed without blocking. It is important to call this before
* attempting a GFP_KERNEL allocation while holding the mddev lock.
* Must be called with mddev_lock held.
- *
- * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock
- * is dropped, so return -EAGAIN after notifying userspace.
*/
-int md_allow_write(struct mddev *mddev)
+void md_allow_write(struct mddev *mddev)
{
if (!mddev->pers)
- return 0;
+ return;
if (mddev->ro)
- return 0;
+ return;
if (!mddev->pers->sync_request)
- return 0;
+ return;
spin_lock(&mddev->lock);
if (mddev->in_sync) {
@@ -7828,13 +8080,12 @@ int md_allow_write(struct mddev *mddev)
spin_unlock(&mddev->lock);
md_update_sb(mddev, 0);
sysfs_notify_dirent_safe(mddev->sysfs_state);
+ /* wait for the dirty state to be recorded in the metadata */
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) &&
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
} else
spin_unlock(&mddev->lock);
-
- if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
- return -EAGAIN;
- else
- return 0;
}
EXPORT_SYMBOL_GPL(md_allow_write);
@@ -8396,9 +8647,8 @@ void md_check_recovery(struct mddev *mddev)
(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
- test_bit(MD_RELOAD_SB, &mddev->flags) ||
(mddev->external == 0 && mddev->safemode == 1) ||
- (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
+ (mddev->safemode == 2
&& !mddev->in_sync && mddev->recovery_cp == MaxSector)
))
return;
@@ -8445,27 +8695,12 @@ void md_check_recovery(struct mddev *mddev)
rdev->raid_disk < 0)
md_kick_rdev_from_array(rdev);
}
-
- if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags))
- md_reload_sb(mddev, mddev->good_device_nr);
}
- if (!mddev->external) {
- int did_change = 0;
+ if (!mddev->external && !mddev->in_sync) {
spin_lock(&mddev->lock);
- if (mddev->safemode &&
- !atomic_read(&mddev->writes_pending) &&
- !mddev->in_sync &&
- mddev->recovery_cp == MaxSector) {
- mddev->in_sync = 1;
- did_change = 1;
- set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
- }
- if (mddev->safemode == 1)
- mddev->safemode = 0;
+ set_in_sync(mddev);
spin_unlock(&mddev->lock);
- if (did_change)
- sysfs_notify_dirent_safe(mddev->sysfs_state);
}
if (mddev->sb_flags)
@@ -8758,6 +8993,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
int role, ret;
char b[BDEVNAME_SIZE];
+ /*
+ * If size is changed in another node then we need to
+ * do resize as well.
+ */
+ if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
+ ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
+ if (ret)
+ pr_info("md-cluster: resize failed\n");
+ else
+ bitmap_update_sb(mddev->bitmap);
+ }
+
/* Check for change of roles in the active devices */
rdev_for_each(rdev2, mddev) {
if (test_bit(Faulty, &rdev2->flags))
@@ -8980,7 +9227,14 @@ static __exit void md_exit(void)
for_each_mddev(mddev, tmp) {
export_array(mddev);
+ mddev->ctime = 0;
mddev->hold_active = 0;
+ /*
+ * for_each_mddev() will call mddev_put() at the end of each
+ * iteration. As the mddev is now fully clear, this will
+ * schedule the mddev for destruction by a workqueue, and the
+ * destroy_workqueue() below will wait for that to complete.
+ */
}
destroy_workqueue(md_misc_wq);
destroy_workqueue(md_wq);
@@ -9001,6 +9255,7 @@ static int set_ro(const char *val, struct kernel_param *kp)
module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
+module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2a514036a83d..b50eb4ac1b82 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -122,12 +122,21 @@ struct md_rdev {
* sysfs entry */
struct badblocks badblocks;
+
+ struct {
+ short offset; /* Offset from superblock to start of PPL.
+ * Not used by external metadata. */
+ unsigned int size; /* Size in sectors of the PPL space */
+ sector_t sector; /* First sector of the PPL space */
+ } ppl;
};
enum flag_bits {
Faulty, /* device is known to have a fault */
In_sync, /* device is in_sync with rest of array */
Bitmap_sync, /* ..actually, not quite In_sync. Need a
- * bitmap-based recovery to get fully in sync
+ * bitmap-based recovery to get fully in sync.
+ * The bit is only meaningful before device
+ * has been passed to pers->hot_add_disk.
*/
WriteMostly, /* Avoid reading if at all possible */
AutoDetected, /* added by auto-detect */
@@ -219,9 +228,6 @@ enum mddev_flags {
* it then */
MD_JOURNAL_CLEAN, /* A raid with journal is already clean */
MD_HAS_JOURNAL, /* The raid array has journal feature set */
- MD_RELOAD_SB, /* Reload the superblock because another node
- * updated it.
- */
MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
* already took resync lock, need to
* release the lock */
@@ -229,6 +235,7 @@ enum mddev_flags {
* supported as calls to md_error() will
* never cause the array to become failed.
*/
+ MD_HAS_PPL, /* The raid array has PPL feature set */
};
enum mddev_sb_flags {
@@ -404,7 +411,8 @@ struct mddev {
*/
unsigned int safemode_delay;
struct timer_list safemode_timer;
- atomic_t writes_pending;
+ struct percpu_ref writes_pending;
+ int sync_checkers; /* # of threads checking writes_pending */
struct request_queue *queue; /* for plugging ... */
struct bitmap *bitmap; /* the bitmap for the device */
@@ -438,6 +446,9 @@ struct mddev {
struct attribute_group *to_remove;
struct bio_set *bio_set;
+ struct bio_set *sync_set; /* for sync operations like
+ * metadata and bitmap writes
+ */
/* Generic flush handling.
* The last to finish preflush schedules a worker to submit
@@ -504,7 +515,7 @@ struct md_personality
int level;
struct list_head list;
struct module *owner;
- void (*make_request)(struct mddev *mddev, struct bio *bio);
+ bool (*make_request)(struct mddev *mddev, struct bio *bio);
int (*run)(struct mddev *mddev);
void (*free)(struct mddev *mddev, void *priv);
void (*status)(struct seq_file *seq, struct mddev *mddev);
@@ -540,6 +551,8 @@ struct md_personality
/* congested implements bdi.congested_fn().
* Will not be called while array is 'suspended' */
int (*congested)(struct mddev *mddev, int bits);
+ /* Changes the consistency policy of an active array. */
+ int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
};
struct md_sysfs_entry {
@@ -640,7 +653,9 @@ extern void md_unregister_thread(struct md_thread **threadp);
extern void md_wakeup_thread(struct md_thread *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
-extern void md_write_start(struct mddev *mddev, struct bio *bi);
+extern int mddev_init_writes_pending(struct mddev *mddev);
+extern bool md_write_start(struct mddev *mddev, struct bio *bi);
+extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
@@ -656,7 +671,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
bool metadata_op);
extern void md_do_sync(struct md_thread *thread);
extern void md_new_event(struct mddev *mddev);
-extern int md_allow_write(struct mddev *mddev);
+extern void md_allow_write(struct mddev *mddev);
extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
extern int md_check_no_bitmap(struct mddev *mddev);
@@ -673,21 +688,13 @@ extern void md_rdev_clear(struct md_rdev *rdev);
extern void mddev_suspend(struct mddev *mddev);
extern void mddev_resume(struct mddev *mddev);
-extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
- struct mddev *mddev);
extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev);
-extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
-static inline int mddev_check_plugged(struct mddev *mddev)
-{
- return !!blk_check_plugged(md_unplug, mddev,
- sizeof(struct blk_plug_cb));
-}
static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
{
@@ -710,4 +717,72 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
{
mddev->flags &= ~unsupported_flags;
}
+
+/*
+ * If @bio is a REQ_OP_WRITE_SAME request and the queue of the block device
+ * it targets reports max_write_same_sectors == 0, clear the same limit on
+ * the array's own queue (mddev->queue). Any other bio leaves the limits
+ * untouched.
+ */
+static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
+{
+ if (bio_op(bio) == REQ_OP_WRITE_SAME &&
+ !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+ mddev->queue->limits.max_write_same_sectors = 0;
+}
+
+/*
+ * If @bio is a REQ_OP_WRITE_ZEROES request and the queue of the block device
+ * it targets reports max_write_zeroes_sectors == 0, clear the same limit on
+ * the array's own queue (mddev->queue). Any other bio leaves the limits
+ * untouched.
+ */
+static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio)
+{
+ if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+ !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+ mddev->queue->limits.max_write_zeroes_sectors = 0;
+}
+
+/* Maximum size of each resync request */
+#define RESYNC_BLOCK_SIZE (64*1024)
+#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
+
+/*
+ * for managing resync I/O pages
+ *
+ * Holds the fixed set of RESYNC_PAGES pages that the resync_*() helpers
+ * below allocate, reference, release and look up.
+ */
+struct resync_pages {
+ unsigned idx; /* for get/put page from the pool */
+ void *raid_bio; /* NOTE(review): presumably the owning per-personality bio - confirm */
+ struct page *pages[RESYNC_PAGES]; /* one slot per page of a resync request */
+};
+
+/*
+ * Fill every slot of rp->pages[] with a page from alloc_page(@gfp_flags).
+ *
+ * Returns 0 when all RESYNC_PAGES pages were allocated. If any allocation
+ * fails, the pages obtained so far are released with put_page() and
+ * -ENOMEM is returned.
+ */
+static inline int resync_alloc_pages(struct resync_pages *rp,
+ gfp_t gfp_flags)
+{
+ int i;
+
+ for (i = 0; i < RESYNC_PAGES; i++) {
+ rp->pages[i] = alloc_page(gfp_flags);
+ if (!rp->pages[i])
+ goto out_free;
+ }
+
+ return 0;
+
+out_free:
+ /* i is the slot that failed; unwind only the slots before it */
+ while (--i >= 0)
+ put_page(rp->pages[i]);
+ return -ENOMEM;
+}
+
+/*
+ * Drop one reference, via put_page(), on each of the RESYNC_PAGES pages
+ * held in @rp. No slot is checked for NULL, so this assumes every slot is
+ * populated (e.g. after a successful resync_alloc_pages()).
+ */
+static inline void resync_free_pages(struct resync_pages *rp)
+{
+ int i;
+
+ for (i = 0; i < RESYNC_PAGES; i++)
+ put_page(rp->pages[i]);
+}
+
+/*
+ * Take one extra reference, via get_page(), on each of the RESYNC_PAGES
+ * pages held in @rp. No slot is checked for NULL.
+ */
+static inline void resync_get_all_pages(struct resync_pages *rp)
+{
+ int i;
+
+ for (i = 0; i < RESYNC_PAGES; i++)
+ get_page(rp->pages[i]);
+}
+
+/*
+ * Return the page stored in slot @idx of @rp.
+ *
+ * An out-of-range @idx (>= RESYNC_PAGES) triggers a one-time warning and
+ * yields NULL instead of reading past the array.
+ */
+static inline struct page *resync_fetch_page(struct resync_pages *rp,
+ unsigned idx)
+{
+ if (WARN_ON_ONCE(idx >= RESYNC_PAGES))
+ return NULL;
+ return rp->pages[idx];
+}
#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index aa8c4e5c1ee2..23a162ba6c56 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -73,12 +73,12 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
* operation and are ready to return a success/failure code to the buffer
* cache layer.
*/
-static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
+static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status)
{
struct bio *bio = mp_bh->master_bio;
struct mpconf *conf = mp_bh->mddev->private;
- bio->bi_error = err;
+ bio->bi_status = status;
bio_endio(bio);
mempool_free(mp_bh, conf->pool);
}
@@ -89,7 +89,7 @@ static void multipath_end_request(struct bio *bio)
struct mpconf *conf = mp_bh->mddev->private;
struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev;
- if (!bio->bi_error)
+ if (!bio->bi_status)
multipath_end_bh_io(mp_bh, 0);
else if (!(bio->bi_opf & REQ_RAHEAD)) {
/*
@@ -102,11 +102,11 @@ static void multipath_end_request(struct bio *bio)
(unsigned long long)bio->bi_iter.bi_sector);
multipath_reschedule_retry(mp_bh);
} else
- multipath_end_bh_io(mp_bh, bio->bi_error);
+ multipath_end_bh_io(mp_bh, bio->bi_status);
rdev_dec_pending(rdev, conf->mddev);
}
-static void multipath_make_request(struct mddev *mddev, struct bio * bio)
+static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
{
struct mpconf *conf = mddev->private;
struct multipath_bh * mp_bh;
@@ -114,7 +114,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
- return;
+ return true;
}
mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
@@ -126,7 +126,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
if (mp_bh->path < 0) {
bio_io_error(bio);
mempool_free(mp_bh, conf->pool);
- return;
+ return true;
}
multipath = conf->multipaths + mp_bh->path;
@@ -138,8 +138,10 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT;
mp_bh->bio.bi_end_io = multipath_end_request;
mp_bh->bio.bi_private = mp_bh;
+ mddev_check_writesame(mddev, &mp_bh->bio);
+ mddev_check_write_zeroes(mddev, &mp_bh->bio);
generic_make_request(&mp_bh->bio);
- return;
+ return true;
}
static void multipath_status(struct seq_file *seq, struct mddev *mddev)
@@ -169,7 +171,7 @@ static int multipath_congested(struct mddev *mddev, int bits)
if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct request_queue *q = bdev_get_queue(rdev->bdev);
- ret |= bdi_congested(&q->backing_dev_info, bits);
+ ret |= bdi_congested(q->backing_dev_info, bits);
/* Just like multipath_map, we just check the
* first available device
*/
@@ -345,7 +347,7 @@ static void multipathd(struct md_thread *thread)
pr_err("multipath: %s: unrecoverable IO read error for block %llu\n",
bdevname(bio->bi_bdev,b),
(unsigned long long)bio->bi_iter.bi_sector);
- multipath_end_bh_io(mp_bh, -EIO);
+ multipath_end_bh_io(mp_bh, BLK_STS_IOERR);
} else {
pr_err("multipath: %s: redirecting sector %llu to another IO path\n",
bdevname(bio->bi_bdev,b),
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 7938cd21fa4c..185dc60360b5 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -976,6 +976,27 @@ int dm_array_cursor_next(struct dm_array_cursor *c)
}
EXPORT_SYMBOL_GPL(dm_array_cursor_next);
+/*
+ * Advance the array cursor by @count entries.
+ *
+ * Returns 0 once the cursor rests on the target entry, or the error from
+ * dm_array_cursor_next() if the array ends before @count entries have been
+ * skipped.
+ */
+int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count)
+{
+	int r;
+
+	do {
+		uint32_t remaining = le32_to_cpu(c->ab->nr_entries) - c->index;
+
+		/* Target lies inside the current block: just move the index. */
+		if (count < remaining) {
+			c->index += count;
+			return 0;
+		}
+
+		/*
+		 * Target lies beyond this block.  Account for the rest of the
+		 * block and park the index on its final entry first, so that
+		 * the single step taken by dm_array_cursor_next() is the one
+		 * that crosses into the following block.  Without moving the
+		 * index, next() merely advances one entry inside the same
+		 * block and the skip ends up far short of @count.
+		 */
+		count -= remaining;
+		c->index += remaining - 1;
+		r = dm_array_cursor_next(c);
+
+	} while (!r);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_cursor_skip);
+
void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le)
{
*value_le = element_at(c->info, c->ab, c->index);
diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h
index 27ee49a55473..d7d2d579c662 100644
--- a/drivers/md/persistent-data/dm-array.h
+++ b/drivers/md/persistent-data/dm-array.h
@@ -207,6 +207,7 @@ void dm_array_cursor_end(struct dm_array_cursor *c);
uint32_t dm_array_cursor_index(struct dm_array_cursor *c);
int dm_array_cursor_next(struct dm_array_cursor *c);
+int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count);
/*
* value_le is only valid while the cursor points at the current value.
diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c
index 36f7cc2c7109..b7208d82e748 100644
--- a/drivers/md/persistent-data/dm-bitset.c
+++ b/drivers/md/persistent-data/dm-bitset.c
@@ -39,6 +39,48 @@ int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root)
}
EXPORT_SYMBOL_GPL(dm_bitset_empty);
+/* State handed to pack_bits() while dm_bitset_new() builds a bitset. */
+struct packer_context {
+ bit_value_fn fn; /* caller's callback supplying the value of each bit */
+ unsigned nr_bits; /* total number of bits in the bitset being built */
+ void *context; /* opaque pointer passed through to fn */
+};
+
+/*
+ * Build the 64-bit word for array entry @index of a new bitset.
+ *
+ * @index: index of the 64-bit word being produced
+ * @value: where the finished word is stored, as a little-endian 64-bit value
+ * @context: a struct packer_context
+ *
+ * Calls the user's callback once for each bit that belongs to this word and
+ * returns the callback's error as soon as one call fails, otherwise 0.
+ */
+static int pack_bits(uint32_t index, void *value, void *context)
+{
+ int r;
+ struct packer_context *p = context;
+ /* the final word may be only partly used: stop at nr_bits */
+ unsigned bit, nr = min(64u, p->nr_bits - (index * 64));
+ uint64_t word = 0;
+ bool bv;
+
+ for (bit = 0; bit < nr; bit++) {
+ r = p->fn(index * 64 + bit, &bv, p->context);
+ if (r)
+ return r;
+
+ if (bv)
+ set_bit(bit, (unsigned long *) &word);
+ else
+ clear_bit(bit, (unsigned long *) &word);
+ }
+
+ *((__le64 *) value) = cpu_to_le64(word);
+
+ return 0;
+}
+
+/*
+ * Create a bitset of @size bits whose values come from @fn.
+ *
+ * The bits are packed 64 to an array entry by pack_bits(), so the underlying
+ * array is created with dm_div_up(@size, 64) entries. @context is passed
+ * through to @fn. Returns whatever dm_array_new() returns.
+ */
+int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root,
+ uint32_t size, bit_value_fn fn, void *context)
+{
+ struct packer_context p;
+ p.fn = fn;
+ p.nr_bits = size;
+ p.context = context;
+
+ return dm_array_new(&info->array_info, root, dm_div_up(size, 64), pack_bits, &p);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_new);
+
int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root,
uint32_t old_nr_entries, uint32_t new_nr_entries,
bool default_value, dm_block_t *new_root)
@@ -168,4 +210,108 @@ int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
}
EXPORT_SYMBOL_GPL(dm_bitset_test_bit);
+/*
+ * Move the bitset cursor on to the next 64-bit word of the underlying array.
+ *
+ * On success the new word is cached in c->current_bits (converted from
+ * little-endian), array_index is incremented and bit_index restarts at 0.
+ * If the array cursor cannot advance its error is returned and the bitset
+ * cursor fields handled here are left as they were.
+ */
+static int cursor_next_array_entry(struct dm_bitset_cursor *c)
+{
+ int r;
+ __le64 *value;
+
+ r = dm_array_cursor_next(&c->cursor);
+ if (r)
+ return r;
+
+ dm_array_cursor_get_value(&c->cursor, (void **) &value);
+ c->array_index++;
+ c->bit_index = 0;
+ c->current_bits = le64_to_cpu(*value);
+ return 0;
+}
+
+/*
+ * Start iterating over the bits of the bitset rooted at @root.
+ *
+ * @nr_entries is the number of bits in the bitset; an empty bitset is
+ * rejected with -ENODATA before anything is initialised. Otherwise the
+ * underlying array cursor is opened and the first 64-bit word is cached, so
+ * the cursor starts on bit 0. Returns 0 on success or the error from
+ * dm_array_cursor_begin().
+ */
+int dm_bitset_cursor_begin(struct dm_disk_bitset *info,
+ dm_block_t root, uint32_t nr_entries,
+ struct dm_bitset_cursor *c)
+{
+ int r;
+ __le64 *value;
+
+ if (!nr_entries)
+ return -ENODATA;
+
+ c->info = info;
+ c->entries_remaining = nr_entries;
+
+ r = dm_array_cursor_begin(&info->array_info, root, &c->cursor);
+ if (r)
+ return r;
+
+ dm_array_cursor_get_value(&c->cursor, (void **) &value);
+ c->array_index = 0;
+ c->bit_index = 0;
+ c->current_bits = le64_to_cpu(*value);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_cursor_begin);
+
+/*
+ * Finish with a bitset cursor by ending the array cursor it wraps.
+ */
+void dm_bitset_cursor_end(struct dm_bitset_cursor *c)
+{
+	/*
+	 * Plain call, not "return <expr>;": ISO C does not allow a return
+	 * statement with an expression in a function returning void.
+	 */
+	dm_array_cursor_end(&c->cursor);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_cursor_end);
+
+/*
+ * Step the cursor on by one bit.
+ *
+ * Returns -ENODATA when entries_remaining is already zero. Otherwise the
+ * count is decremented and the bit index advanced; on leaving bit 63 the
+ * next array word is loaded and any error from that is returned. Note that
+ * entries_remaining has already been decremented by then and is not
+ * restored if loading the next word fails.
+ */
+int dm_bitset_cursor_next(struct dm_bitset_cursor *c)
+{
+ int r = 0;
+
+ if (!c->entries_remaining)
+ return -ENODATA;
+
+ c->entries_remaining--;
+ if (++c->bit_index > 63)
+ r = cursor_next_array_entry(c);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_cursor_next);
+
+/*
+ * Advance the cursor by @count bits.
+ *
+ * Returns -ENODATA if fewer than @count entries remain, the error from
+ * dm_array_cursor_skip() if the underlying array cursor cannot be moved,
+ * and 0 otherwise.
+ */
+int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count)
+{
+ int r;
+ __le64 *value;
+ uint32_t nr_array_skip;
+ uint32_t remaining_in_word = 64 - c->bit_index;
+
+ if (c->entries_remaining < count)
+ return -ENODATA;
+
+ if (count < remaining_in_word) {
+ /* target is inside the cached word: no array access needed */
+ c->bit_index += count;
+ c->entries_remaining -= count;
+ return 0;
+
+ } else {
+ /* use up the rest of this word; count becomes bits past its end */
+ c->entries_remaining -= remaining_in_word;
+ count -= remaining_in_word;
+ }
+
+ /* +1 steps off the current word, count / 64 skips whole words after it */
+ nr_array_skip = (count / 64) + 1;
+ r = dm_array_cursor_skip(&c->cursor, nr_array_skip);
+ /* on failure entries_remaining keeps the reduction made above */
+ if (r)
+ return r;
+
+ dm_array_cursor_get_value(&c->cursor, (void **) &value);
+ c->entries_remaining -= count;
+ c->array_index += nr_array_skip;
+ c->bit_index = count & 63;
+ c->current_bits = le64_to_cpu(*value);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_cursor_skip);
+
+/*
+ * Return the value of the bit the cursor currently rests on, read from the
+ * cached word (c->current_bits) at c->bit_index.
+ */
+bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c)
+{
+ return test_bit(c->bit_index, (unsigned long *) &c->current_bits);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_cursor_get_value);
+
/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h
index c2287d672ef5..df888da04ee1 100644
--- a/drivers/md/persistent-data/dm-bitset.h
+++ b/drivers/md/persistent-data/dm-bitset.h
@@ -93,6 +93,22 @@ void dm_disk_bitset_init(struct dm_transaction_manager *tm,
int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root);
/*
+ * Creates a new bitset populated with values provided by a callback
+ * function. This is more efficient than creating an empty bitset,
+ * resizing, and then setting values since that process incurs a lot of
+ * copying.
+ *
+ * info - describes the array
+ * root - the root block of the array on disk
+ * size - the number of entries in the array
+ * fn - the callback
+ * context - passed to the callback
+ */
+typedef int (*bit_value_fn)(uint32_t index, bool *value, void *context);
+int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root,
+ uint32_t size, bit_value_fn fn, void *context);
+
+/*
* Resize the bitset.
*
* info - describes the bitset
@@ -161,6 +177,29 @@ int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
dm_block_t *new_root);
+/* Iteration state for walking a bitset one bit at a time. */
+struct dm_bitset_cursor {
+ struct dm_disk_bitset *info; /* the bitset being walked */
+ struct dm_array_cursor cursor; /* cursor over the underlying array of 64-bit words */
+
+ uint32_t entries_remaining; /* bits not yet consumed by next/skip */
+ uint32_t array_index; /* index of the word cached in current_bits */
+ uint32_t bit_index; /* position of the current bit within that word */
+ uint64_t current_bits; /* cached word, converted from little-endian */
+};
+
+/*
+ * Make sure you've flushed any dm_disk_bitset and updated the root before
+ * using this.
+ */
+int dm_bitset_cursor_begin(struct dm_disk_bitset *info,
+ dm_block_t root, uint32_t nr_entries,
+ struct dm_bitset_cursor *c);
+void dm_bitset_cursor_end(struct dm_bitset_cursor *c);
+
+int dm_bitset_cursor_next(struct dm_bitset_cursor *c);
+int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count);
+bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c);
+
/*----------------------------------------------------------------*/
#endif /* _LINUX_DM_BITSET_H */
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index a6dde7cab458..ea15d220ced7 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -13,6 +13,7 @@
#include <linux/rwsem.h>
#include <linux/device-mapper.h>
#include <linux/stacktrace.h>
+#include <linux/sched/task.h>
#define DM_MSG_PREFIX "block manager"
@@ -120,7 +121,7 @@ static int __check_holder(struct block_lock *lock)
static void __wait(struct waiter *w)
{
for (;;) {
- set_task_state(current, TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
if (!w->task)
break;
@@ -128,7 +129,7 @@ static void __wait(struct waiter *w)
schedule();
}
- set_task_state(current, TASK_RUNNING);
+ set_current_state(TASK_RUNNING);
}
static void __wake_waiter(struct waiter *w)
@@ -377,7 +378,6 @@ struct dm_block_manager {
struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
unsigned block_size,
- unsigned cache_size,
unsigned max_held_per_thread)
{
int r;
@@ -462,7 +462,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
int r;
p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
- if (IS_ERR(p))
+ if (unlikely(IS_ERR(p)))
return PTR_ERR(p);
aux = dm_bufio_get_aux_data(to_buffer(*result));
@@ -498,7 +498,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm,
return -EPERM;
p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
- if (IS_ERR(p))
+ if (unlikely(IS_ERR(p)))
return PTR_ERR(p);
aux = dm_bufio_get_aux_data(to_buffer(*result));
@@ -531,7 +531,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm,
int r;
p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
- if (IS_ERR(p))
+ if (unlikely(IS_ERR(p)))
return PTR_ERR(p);
if (unlikely(!p))
return -EWOULDBLOCK;
@@ -567,7 +567,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
return -EPERM;
p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
- if (IS_ERR(p))
+ if (unlikely(IS_ERR(p)))
return PTR_ERR(p);
memset(p, 0, dm_bm_block_size(bm));
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 3627d1b7667a..e728937f376a 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -33,7 +33,7 @@ void *dm_block_data(struct dm_block *b);
struct dm_block_manager;
struct dm_block_manager *dm_block_manager_create(
struct block_device *bdev, unsigned block_size,
- unsigned cache_size, unsigned max_held_per_thread);
+ unsigned max_held_per_thread);
void dm_block_manager_destroy(struct dm_block_manager *bm);
unsigned dm_bm_block_size(struct dm_block_manager *bm);
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 20a40329d84a..f21ce6a3d4cf 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -272,7 +272,12 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
int r;
struct del_stack *s;
- s = kmalloc(sizeof(*s), GFP_NOIO);
+ /*
+ * dm_btree_del() is called via an ioctl, as such should be
+ * considered an FS op. We can't recurse back into the FS, so we
+ * allocate GFP_NOFS.
+ */
+ s = kmalloc(sizeof(*s), GFP_NOFS);
if (!s)
return -ENOMEM;
s->info = info;
@@ -897,8 +902,12 @@ static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
else
*result_key = le64_to_cpu(ro_node(s)->keys[0]);
- if (next_block || flags & INTERNAL_NODE)
- block = value64(ro_node(s), i);
+ if (next_block || flags & INTERNAL_NODE) {
+ if (find_highest)
+ block = value64(ro_node(s), i);
+ else
+ block = value64(ro_node(s), 0);
+ }
} while (flags & INTERNAL_NODE);
@@ -1139,6 +1148,17 @@ int dm_btree_cursor_next(struct dm_btree_cursor *c)
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);
+/*
+ * Advance the btree cursor by @count entries, one dm_btree_cursor_next()
+ * call per entry.
+ *
+ * Stops at the first call that fails and returns its error; returns 0 if
+ * all @count steps succeed. A @count of 0 returns 0 without touching the
+ * cursor.
+ */
+int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
+{
+ int r = 0;
+
+ while (count-- && !r)
+ r = dm_btree_cursor_next(c);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);
+
int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
if (c->depth) {
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index db9bd26adf31..3dc5bb1a4748 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -209,6 +209,7 @@ int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
bool prefetch_leaves, struct dm_btree_cursor *c);
void dm_btree_cursor_end(struct dm_btree_cursor *c);
int dm_btree_cursor_next(struct dm_btree_cursor *c);
+int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count);
int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le);
#endif /* _LINUX_DM_BTREE_H */
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 4c28608a0c94..829b4ce057d8 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -626,13 +626,19 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
void *root_le, size_t len)
{
int r;
- struct disk_sm_root *smr = root_le;
+ struct disk_sm_root smr;
if (len < sizeof(struct disk_sm_root)) {
DMERR("sm_metadata root too small");
return -ENOMEM;
}
+ /*
+ * We don't know the alignment of the root_le buffer, so need to
+ * copy into a new structure.
+ */
+ memcpy(&smr, root_le, sizeof(smr));
+
r = sm_ll_init(ll, tm);
if (r < 0)
return r;
@@ -644,10 +650,10 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
ll->max_entries = metadata_ll_max_entries;
ll->commit = metadata_ll_commit;
- ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
- ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
- ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
- ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
+ ll->nr_blocks = le64_to_cpu(smr.nr_blocks);
+ ll->nr_allocated = le64_to_cpu(smr.nr_allocated);
+ ll->bitmap_root = le64_to_cpu(smr.bitmap_root);
+ ll->ref_count_root = le64_to_cpu(smr.ref_count_root);
return ll->open_index(ll);
}
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index ebb280a14325..32adf6b4a9c7 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -142,10 +142,23 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
{
+ int r;
+ uint32_t old_count;
enum allocation_event ev;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
- return sm_ll_dec(&smd->ll, b, &ev);
+ r = sm_ll_dec(&smd->ll, b, &ev);
+ if (!r && (ev == SM_FREE)) {
+ /*
+ * It's only free if it's also free in the last
+ * transaction.
+ */
+ r = sm_ll_lookup(&smd->old_ll, b, &old_count);
+ if (!r && !old_count)
+ smd->nr_allocated_this_transaction--;
+ }
+
+ return r;
}
static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 20557e2c60c6..4aed69d9dd17 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -544,7 +544,7 @@ static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t
static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks);
-static struct dm_space_map ops = {
+static const struct dm_space_map ops = {
.destroy = sm_metadata_destroy,
.extend = sm_metadata_extend,
.get_nr_blocks = sm_metadata_get_nr_blocks,
@@ -671,7 +671,7 @@ static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where,
return -EINVAL;
}
-static struct dm_space_map bootstrap_ops = {
+static const struct dm_space_map bootstrap_ops = {
.destroy = sm_bootstrap_destroy,
.extend = sm_bootstrap_extend,
.get_nr_blocks = sm_bootstrap_get_nr_blocks,
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 848365d474f3..94d9ae9b0fd0 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -29,7 +29,8 @@
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
(1L << MD_JOURNAL_CLEAN) | \
- (1L << MD_FAILFAST_SUPPORTED))
+ (1L << MD_FAILFAST_SUPPORTED) |\
+ (1L << MD_HAS_PPL))
static int raid0_congested(struct mddev *mddev, int bits)
{
@@ -41,7 +42,7 @@ static int raid0_congested(struct mddev *mddev, int bits)
for (i = 0; i < raid_disks && !ret ; i++) {
struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
- ret |= bdi_congested(&q->backing_dev_info, bits);
+ ret |= bdi_congested(q->backing_dev_info, bits);
}
return ret;
}
@@ -383,7 +384,8 @@ static int raid0_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
- blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
blk_queue_io_opt(mddev->queue,
@@ -420,8 +422,8 @@ static int raid0_run(struct mddev *mddev)
*/
int stripe = mddev->raid_disks *
(mddev->chunk_sectors << 9) / PAGE_SIZE;
- if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
- mddev->queue->backing_dev_info.ra_pages = 2* stripe;
+ if (mddev->queue->backing_dev_info->ra_pages < 2* stripe)
+ mddev->queue->backing_dev_info->ra_pages = 2* stripe;
}
dump_zones(mddev);
@@ -457,55 +459,147 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
}
}
-static void raid0_make_request(struct mddev *mddev, struct bio *bio)
+/*
+ * Handle a discard bio by issuing one discard per member device of the
+ * zone the bio starts in, instead of splitting it chunk by chunk.
+ *
+ * A bio that runs past the end of its zone is split at the zone boundary:
+ * the part inside the zone is handled here and the rest of the original bio
+ * is resubmitted with generic_make_request(). For every device of the zone
+ * the covered sector range is computed, a discard for it is built with
+ * __blkdev_issue_discard(), chained to the bio being handled and submitted.
+ * Devices whose range is empty, or for which no discard bio could be
+ * built, are skipped. The bio being handled is ended once all per-device
+ * discards have been submitted.
+ */
+static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+ struct r0conf *conf = mddev->private;
+ struct strip_zone *zone;
+ sector_t start = bio->bi_iter.bi_sector;
+ sector_t end;
+ unsigned int stripe_size;
+ sector_t first_stripe_index, last_stripe_index;
+ sector_t start_disk_offset;
+ unsigned int start_disk_index;
+ sector_t end_disk_offset;
+ unsigned int end_disk_index;
+ unsigned int disk;
+
+ /* NOTE(review): find_zone() presumably also rebases start to be zone-relative - confirm */
+ zone = find_zone(conf, &start);
+
+ if (bio_end_sector(bio) > zone->zone_end) {
+ struct bio *split = bio_split(bio,
+ zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
+ mddev->bio_set);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ end = zone->zone_end;
+ } else
+ end = bio_end_sector(bio);
+
+ /* for any zone but the first, rebase end against the previous zone's end */
+ if (zone != conf->strip_zone)
+ end = end - zone[-1].zone_end;
+
+ /* Now start and end are offsets within the zone */
+ stripe_size = zone->nb_dev * mddev->chunk_sectors;
+
+ first_stripe_index = start;
+ sector_div(first_stripe_index, stripe_size);
+ last_stripe_index = end;
+ sector_div(last_stripe_index, stripe_size);
+
+ /* which device, and which offset on it, start and end fall on */
+ start_disk_index = (int)(start - first_stripe_index * stripe_size) /
+ mddev->chunk_sectors;
+ start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
+ mddev->chunk_sectors) +
+ first_stripe_index * mddev->chunk_sectors;
+ end_disk_index = (int)(end - last_stripe_index * stripe_size) /
+ mddev->chunk_sectors;
+ end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
+ mddev->chunk_sectors) +
+ last_stripe_index * mddev->chunk_sectors;
+
+ for (disk = 0; disk < zone->nb_dev; disk++) {
+ sector_t dev_start, dev_end;
+ struct bio *discard_bio = NULL;
+ struct md_rdev *rdev;
+
+ /*
+ * Devices before the one holding 'start' begin at the next
+ * stripe, devices after it begin at the first stripe, and the
+ * device holding 'start' begins at the exact offset.
+ */
+ if (disk < start_disk_index)
+ dev_start = (first_stripe_index + 1) *
+ mddev->chunk_sectors;
+ else if (disk > start_disk_index)
+ dev_start = first_stripe_index * mddev->chunk_sectors;
+ else
+ dev_start = start_disk_offset;
+
+ /* same reasoning for the end of the range, using the last stripe */
+ if (disk < end_disk_index)
+ dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
+ else if (disk > end_disk_index)
+ dev_end = last_stripe_index * mddev->chunk_sectors;
+ else
+ dev_end = end_disk_offset;
+
+ /* nothing to discard on this device */
+ if (dev_end <= dev_start)
+ continue;
+
+ /* devlist is indexed by zone number * devices in zone 0, plus disk */
+ rdev = conf->devlist[(zone - conf->strip_zone) *
+ conf->strip_zone[0].nb_dev + disk];
+ if (__blkdev_issue_discard(rdev->bdev,
+ dev_start + zone->dev_start + rdev->data_offset,
+ dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
+ !discard_bio)
+ continue;
+ bio_chain(discard_bio, bio);
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(rdev->bdev),
+ discard_bio, disk_devt(mddev->gendisk),
+ bio->bi_iter.bi_sector);
+ generic_make_request(discard_bio);
+ }
+ bio_endio(bio);
+}
+
+static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
{
struct strip_zone *zone;
struct md_rdev *tmp_dev;
- struct bio *split;
+ sector_t bio_sector;
+ sector_t sector;
+ unsigned chunk_sects;
+ unsigned sectors;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
- return;
+ return true;
+ }
+
+ if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
+ raid0_handle_discard(mddev, bio);
+ return true;
}
- do {
- sector_t bio_sector = bio->bi_iter.bi_sector;
- sector_t sector = bio_sector;
- unsigned chunk_sects = mddev->chunk_sectors;
+ bio_sector = bio->bi_iter.bi_sector;
+ sector = bio_sector;
+ chunk_sects = mddev->chunk_sectors;
- unsigned sectors = chunk_sects -
- (likely(is_power_of_2(chunk_sects))
- ? (sector & (chunk_sects-1))
- : sector_div(sector, chunk_sects));
+ sectors = chunk_sects -
+ (likely(is_power_of_2(chunk_sects))
+ ? (sector & (chunk_sects-1))
+ : sector_div(sector, chunk_sects));
- /* Restore due to sector_div */
- sector = bio_sector;
+ /* Restore due to sector_div */
+ sector = bio_sector;
- if (sectors < bio_sectors(bio)) {
- split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
- bio_chain(split, bio);
- } else {
- split = bio;
- }
+ if (sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, sectors, GFP_NOIO, mddev->bio_set);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ }
- zone = find_zone(mddev->private, &sector);
- tmp_dev = map_sector(mddev, zone, sector, &sector);
- split->bi_bdev = tmp_dev->bdev;
- split->bi_iter.bi_sector = sector + zone->dev_start +
- tmp_dev->data_offset;
-
- if (unlikely((bio_op(split) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
- /* Just ignore it */
- bio_endio(split);
- } else {
- if (mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
- split, disk_devt(mddev->gendisk),
- bio_sector);
- generic_make_request(split);
- }
- } while (split != bio);
+ zone = find_zone(mddev->private, &sector);
+ tmp_dev = map_sector(mddev, zone, sector, &sector);
+ bio->bi_bdev = tmp_dev->bdev;
+ bio->bi_iter.bi_sector = sector + zone->dev_start +
+ tmp_dev->data_offset;
+
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+ bio, disk_devt(mddev->gendisk),
+ bio_sector);
+ mddev_check_writesame(mddev, bio);
+ mddev_check_write_zeroes(mddev, bio);
+ generic_make_request(bio);
+ return true;
}
static void raid0_status(struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7b0f647bcccb..3febfc8391fb 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -37,14 +37,18 @@
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
+#include <linux/sched/signal.h>
+
#include <trace/events/block.h>
+
#include "md.h"
#include "raid1.h"
#include "bitmap.h"
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
- (1L << MD_JOURNAL_CLEAN))
+ (1L << MD_JOURNAL_CLEAN) | \
+ (1L << MD_HAS_PPL))
/*
* Number of guaranteed r1bios in case of extreme VM load:
@@ -71,13 +75,30 @@
*/
static int max_queued_requests = 1024;
-static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
- sector_t bi_sector);
-static void lower_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
+static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#define raid1_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
+/*
+ * 'struct resync_pages' holds the actual pages used for doing the resync
+ * IO.  There is one per bio, and the bio's .bi_private points to it.
+ */
+static inline struct resync_pages *get_resync_pages(struct bio *bio)
+{
+	struct resync_pages *rp = bio->bi_private;
+
+	return rp;
+}
+
+/*
+ * For a resync bio, the r1bio pointer is stored in the raid_bio field
+ * of the per-bio 'struct resync_pages'.
+ */
+static inline struct r1bio *get_resync_r1bio(struct bio *bio)
+{
+	struct resync_pages *rp = get_resync_pages(bio);
+
+	return rp->raid_bio;
+}
+
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
{
struct pool_info *pi = data;
@@ -92,15 +113,12 @@ static void r1bio_pool_free(void *r1_bio, void *data)
kfree(r1_bio);
}
-#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_DEPTH 32
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
-#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
-#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
{
@@ -108,12 +126,18 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
struct r1bio *r1_bio;
struct bio *bio;
int need_pages;
- int i, j;
+ int j;
+ struct resync_pages *rps;
r1_bio = r1bio_pool_alloc(gfp_flags, pi);
if (!r1_bio)
return NULL;
+ rps = kmalloc(sizeof(struct resync_pages) * pi->raid_disks,
+ gfp_flags);
+ if (!rps)
+ goto out_free_r1bio;
+
/*
* Allocate bios : 1 for reading, n-1 for writing
*/
@@ -133,19 +157,22 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
need_pages = pi->raid_disks;
else
need_pages = 1;
- for (j = 0; j < need_pages; j++) {
+ for (j = 0; j < pi->raid_disks; j++) {
+ struct resync_pages *rp = &rps[j];
+
bio = r1_bio->bios[j];
- bio->bi_vcnt = RESYNC_PAGES;
- if (bio_alloc_pages(bio, gfp_flags))
- goto out_free_pages;
- }
- /* If not user-requests, copy the page pointers to all bios */
- if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
- for (i=0; i<RESYNC_PAGES ; i++)
- for (j=1; j<pi->raid_disks; j++)
- r1_bio->bios[j]->bi_io_vec[i].bv_page =
- r1_bio->bios[0]->bi_io_vec[i].bv_page;
+ if (j < need_pages) {
+ if (resync_alloc_pages(rp, gfp_flags))
+ goto out_free_pages;
+ } else {
+ memcpy(rp, &rps[0], sizeof(*rp));
+ resync_get_all_pages(rp);
+ }
+
+ rp->idx = 0;
+ rp->raid_bio = r1_bio;
+ bio->bi_private = rp;
}
r1_bio->master_bio = NULL;
@@ -154,11 +181,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
out_free_pages:
while (--j >= 0)
- bio_free_pages(r1_bio->bios[j]);
+ resync_free_pages(&rps[j]);
out_free_bio:
while (++j < pi->raid_disks)
bio_put(r1_bio->bios[j]);
+ kfree(rps);
+
+out_free_r1bio:
r1bio_pool_free(r1_bio, data);
return NULL;
}
@@ -166,18 +196,18 @@ out_free_bio:
static void r1buf_pool_free(void *__r1_bio, void *data)
{
struct pool_info *pi = data;
- int i,j;
+ int i;
struct r1bio *r1bio = __r1_bio;
+ struct resync_pages *rp = NULL;
- for (i = 0; i < RESYNC_PAGES; i++)
- for (j = pi->raid_disks; j-- ;) {
- if (j == 0 ||
- r1bio->bios[j]->bi_io_vec[i].bv_page !=
- r1bio->bios[0]->bi_io_vec[i].bv_page)
- safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
- }
- for (i=0 ; i < pi->raid_disks; i++)
+ for (i = pi->raid_disks; i--; ) {
+ rp = get_resync_pages(r1bio->bios[i]);
+ resync_free_pages(rp);
bio_put(r1bio->bios[i]);
+ }
+
+ /* resync pages array stored in the 1st bio's .bi_private */
+ kfree(rp);
r1bio_pool_free(r1bio, data);
}
@@ -205,6 +235,7 @@ static void free_r1bio(struct r1bio *r1_bio)
static void put_buf(struct r1bio *r1_bio)
{
struct r1conf *conf = r1_bio->mddev->private;
+ sector_t sect = r1_bio->sector;
int i;
for (i = 0; i < conf->raid_disks * 2; i++) {
@@ -215,7 +246,7 @@ static void put_buf(struct r1bio *r1_bio)
mempool_free(r1_bio, conf->r1buf_pool);
- lower_barrier(conf);
+ lower_barrier(conf, sect);
}
static void reschedule_retry(struct r1bio *r1_bio)
@@ -223,10 +254,12 @@ static void reschedule_retry(struct r1bio *r1_bio)
unsigned long flags;
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
+ int idx;
+ idx = sector_to_idx(r1_bio->sector);
spin_lock_irqsave(&conf->device_lock, flags);
list_add(&r1_bio->retry_list, &conf->retry_list);
- conf->nr_queued ++;
+ atomic_inc(&conf->nr_queued[idx]);
spin_unlock_irqrestore(&conf->device_lock, flags);
wake_up(&conf->wait_barrier);
@@ -241,36 +274,17 @@ static void reschedule_retry(struct r1bio *r1_bio)
static void call_bio_endio(struct r1bio *r1_bio)
{
struct bio *bio = r1_bio->master_bio;
- int done;
struct r1conf *conf = r1_bio->mddev->private;
- sector_t start_next_window = r1_bio->start_next_window;
- sector_t bi_sector = bio->bi_iter.bi_sector;
-
- if (bio->bi_phys_segments) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- bio->bi_phys_segments--;
- done = (bio->bi_phys_segments == 0);
- spin_unlock_irqrestore(&conf->device_lock, flags);
- /*
- * make_request() might be waiting for
- * bi_phys_segments to decrease
- */
- wake_up(&conf->wait_barrier);
- } else
- done = 1;
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
- bio->bi_error = -EIO;
+ bio->bi_status = BLK_STS_IOERR;
- if (done) {
- bio_endio(bio);
- /*
- * Wake up any possible resync thread that waits for the device
- * to go idle.
- */
- allow_barrier(conf, start_next_window, bi_sector);
- }
+ bio_endio(bio);
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf, r1_bio->sector);
}
static void raid_end_bio_io(struct r1bio *r1_bio)
@@ -321,7 +335,7 @@ static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
static void raid1_end_read_request(struct bio *bio)
{
- int uptodate = !bio->bi_error;
+ int uptodate = !bio->bi_status;
struct r1bio *r1_bio = bio->bi_private;
struct r1conf *conf = r1_bio->mddev->private;
struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
@@ -374,12 +388,9 @@ static void close_write(struct r1bio *r1_bio)
{
/* it really is the end of this request */
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
- /* free extra copy of the data pages */
- int i = r1_bio->behind_page_count;
- while (i--)
- safe_put_page(r1_bio->behind_bvecs[i].bv_page);
- kfree(r1_bio->behind_bvecs);
- r1_bio->behind_bvecs = NULL;
+ bio_free_pages(r1_bio->behind_master_bio);
+ bio_put(r1_bio->behind_master_bio);
+ r1_bio->behind_master_bio = NULL;
}
/* clear the bitmap if all writes complete successfully */
bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
@@ -415,12 +426,12 @@ static void raid1_end_write_request(struct bio *bio)
struct md_rdev *rdev = conf->mirrors[mirror].rdev;
bool discard_error;
- discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;
+ discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
/*
* 'one mirror IO has finished' event handler:
*/
- if (bio->bi_error && !discard_error) {
+ if (bio->bi_status && !discard_error) {
set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED, &
@@ -481,6 +492,10 @@ static void raid1_end_write_request(struct bio *bio)
}
if (behind) {
+ /* we release the behind master bio when all writes are done */
+ if (r1_bio->behind_master_bio == bio)
+ to_put = NULL;
+
if (test_bit(WriteMostly, &rdev->flags))
atomic_dec(&r1_bio->behind_remaining);
@@ -517,6 +532,25 @@ static void raid1_end_write_request(struct bio *bio)
bio_put(to_put);
}
+/*
+ * Limit a request of @sectors sectors beginning at @start_sector so that
+ * it does not run past the end of the barrier unit that @start_sector
+ * belongs to.  Returns the (possibly shortened) length in sectors.
+ * Warns if @sectors is 0.
+ */
+static sector_t align_to_barrier_unit_end(sector_t start_sector,
+					  sector_t sectors)
+{
+	sector_t unit_end;
+	sector_t to_unit_end;
+
+	WARN_ON(sectors == 0);
+
+	/* first barrier unit boundary strictly above start_sector */
+	unit_end = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE);
+	/* sectors from start_sector up to that boundary */
+	to_unit_end = unit_end - start_sector;
+
+	return to_unit_end > sectors ? sectors : to_unit_end;
+}
+
/*
* This routine returns the disk from which the requested read should
* be done. There is a per-array 'next expected sequential IO' sector
@@ -632,8 +666,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
break;
}
continue;
- } else
+ } else {
+ if ((sectors > best_good_sectors) && (best_disk >= 0))
+ best_disk = -1;
best_good_sectors = sectors;
+ }
if (best_disk >= 0)
/* At least two disks to choose from so failfast is OK */
@@ -744,15 +781,39 @@ static int raid1_congested(struct mddev *mddev, int bits)
* non-congested targets, it can be removed
*/
if ((bits & (1 << WB_async_congested)) || 1)
- ret |= bdi_congested(&q->backing_dev_info, bits);
+ ret |= bdi_congested(q->backing_dev_info, bits);
else
- ret &= bdi_congested(&q->backing_dev_info, bits);
+ ret &= bdi_congested(q->backing_dev_info, bits);
}
}
rcu_read_unlock();
return ret;
}
+/*
+ * Submit a singly linked (via ->bi_next) list of pending write bios.
+ * On entry each bio's ->bi_bdev field actually carries a struct md_rdev
+ * pointer; it is replaced with the rdev's real block device before the
+ * bio is completed or submitted.
+ */
+static void flush_bio_list(struct r1conf *conf, struct bio *bio)
+{
+	struct bio *next;
+
+	/* flush any pending bitmap writes to disk before proceeding w/ I/O */
+	bitmap_unplug(conf->mddev->bitmap);
+	wake_up(&conf->wait_barrier);
+
+	/* submit pending writes */
+	for (; bio; bio = next) {
+		struct md_rdev *rdev = (void *)bio->bi_bdev;
+
+		/* detach first: the bio must not be touched once handed off */
+		next = bio->bi_next;
+		bio->bi_next = NULL;
+		bio->bi_bdev = rdev->bdev;
+
+		if (test_bit(Faulty, &rdev->flags)) {
+			bio->bi_status = BLK_STS_IOERR;
+			bio_endio(bio);
+			continue;
+		}
+
+		if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+			     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+			/* Just ignore it */
+			bio_endio(bio);
+			continue;
+		}
+
+		generic_make_request(bio);
+	}
+}
+
static void flush_pending_writes(struct r1conf *conf)
{
/* Any writes that have been queued but are awaiting
@@ -765,27 +826,7 @@ static void flush_pending_writes(struct r1conf *conf)
bio = bio_list_get(&conf->pending_bio_list);
conf->pending_count = 0;
spin_unlock_irq(&conf->device_lock);
- /* flush any pending bitmap writes to
- * disk before proceeding w/ I/O */
- bitmap_unplug(conf->mddev->bitmap);
- wake_up(&conf->wait_barrier);
-
- while (bio) { /* submit pending writes */
- struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void*)bio->bi_bdev;
- bio->bi_next = NULL;
- bio->bi_bdev = rdev->bdev;
- if (test_bit(Faulty, &rdev->flags)) {
- bio->bi_error = -EIO;
- bio_endio(bio);
- } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
- /* Just ignore it */
- bio_endio(bio);
- else
- generic_make_request(bio);
- bio = next;
- }
+ flush_bio_list(conf, bio);
} else
spin_unlock_irq(&conf->device_lock);
}
@@ -813,168 +854,229 @@ static void flush_pending_writes(struct r1conf *conf)
*/
static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
{
+ int idx = sector_to_idx(sector_nr);
+
spin_lock_irq(&conf->resync_lock);
/* Wait until no block IO is waiting */
- wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+ wait_event_lock_irq(conf->wait_barrier,
+ !atomic_read(&conf->nr_waiting[idx]),
conf->resync_lock);
/* block any new IO from starting */
- conf->barrier++;
- conf->next_resync = sector_nr;
+ atomic_inc(&conf->barrier[idx]);
+ /*
+ * In raise_barrier() we firstly increase conf->barrier[idx] then
+ * check conf->nr_pending[idx]. In _wait_barrier() we firstly
+ * increase conf->nr_pending[idx] then check conf->barrier[idx].
+ * A memory barrier here to make sure conf->nr_pending[idx] won't
+ * be fetched before conf->barrier[idx] is increased. Otherwise
+ * there will be a race between raise_barrier() and _wait_barrier().
+ */
+ smp_mb__after_atomic();
/* For these conditions we must wait:
* A: while the array is in frozen state
- * B: while barrier >= RESYNC_DEPTH, meaning resync reach
- * the max count which allowed.
- * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
- * next resync will reach to the window which normal bios are
- * handling.
- * D: while there are any active requests in the current window.
+ * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
+ * existing in corresponding I/O barrier bucket.
+ * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches
+ * max resync count which allowed on current I/O barrier bucket.
*/
wait_event_lock_irq(conf->wait_barrier,
!conf->array_frozen &&
- conf->barrier < RESYNC_DEPTH &&
- conf->current_window_requests == 0 &&
- (conf->start_next_window >=
- conf->next_resync + RESYNC_SECTORS),
+ !atomic_read(&conf->nr_pending[idx]) &&
+ atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH,
conf->resync_lock);
- conf->nr_pending++;
+ atomic_inc(&conf->nr_sync_pending);
spin_unlock_irq(&conf->resync_lock);
}
-static void lower_barrier(struct r1conf *conf)
+static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
{
- unsigned long flags;
- BUG_ON(conf->barrier <= 0);
- spin_lock_irqsave(&conf->resync_lock, flags);
- conf->barrier--;
- conf->nr_pending--;
- spin_unlock_irqrestore(&conf->resync_lock, flags);
+ int idx = sector_to_idx(sector_nr);
+
+ BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
+
+ atomic_dec(&conf->barrier[idx]);
+ atomic_dec(&conf->nr_sync_pending);
wake_up(&conf->wait_barrier);
}
-static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
+static void _wait_barrier(struct r1conf *conf, int idx)
{
- bool wait = false;
+ /*
+ * We need to increase conf->nr_pending[idx] very early here,
+ * then raise_barrier() can be blocked when it waits for
+ * conf->nr_pending[idx] to be 0. Then we can avoid holding
+ * conf->resync_lock when there is no barrier raised in same
+ * barrier unit bucket. Also if the array is frozen, I/O
+ * should be blocked until array is unfrozen.
+ */
+ atomic_inc(&conf->nr_pending[idx]);
+ /*
+ * In _wait_barrier() we firstly increase conf->nr_pending[idx], then
+ * check conf->barrier[idx]. In raise_barrier() we firstly increase
+ * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
+ * barrier is necessary here to make sure conf->barrier[idx] won't be
+ * fetched before conf->nr_pending[idx] is increased. Otherwise there
+ * will be a race between _wait_barrier() and raise_barrier().
+ */
+ smp_mb__after_atomic();
- if (conf->array_frozen || !bio)
- wait = true;
- else if (conf->barrier && bio_data_dir(bio) == WRITE) {
- if ((conf->mddev->curr_resync_completed
- >= bio_end_sector(bio)) ||
- (conf->start_next_window + NEXT_NORMALIO_DISTANCE
- <= bio->bi_iter.bi_sector))
- wait = false;
- else
- wait = true;
- }
+ /*
+ * Don't worry about checking two atomic_t variables at the same
+ * time here. If, while we check conf->barrier[idx], the array is
+ * frozen (conf->array_frozen is 1) and conf->barrier[idx] is 0,
+ * it is safe to return and let the I/O continue. Because the
+ * array is frozen, all I/O returned here will eventually complete
+ * or be queued, so no race will happen. See the code comment in
+ * freeze_array().
+ */
+ if (!READ_ONCE(conf->array_frozen) &&
+ !atomic_read(&conf->barrier[idx]))
+ return;
- return wait;
+ /*
+ * After holding conf->resync_lock, conf->nr_pending[idx]
+ * should be decreased before waiting for barrier to drop.
+ * Otherwise, we may encounter a race condition because
+ * raise_barrier() might be waiting for conf->nr_pending[idx]
+ * to be 0 at the same time.
+ */
+ spin_lock_irq(&conf->resync_lock);
+ atomic_inc(&conf->nr_waiting[idx]);
+ atomic_dec(&conf->nr_pending[idx]);
+ /*
+ * In case freeze_array() is waiting for
+ * get_unqueued_pending() == extra
+ */
+ wake_up(&conf->wait_barrier);
+ /* Wait for the barrier in same barrier unit bucket to drop. */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->array_frozen &&
+ !atomic_read(&conf->barrier[idx]),
+ conf->resync_lock);
+ atomic_inc(&conf->nr_pending[idx]);
+ atomic_dec(&conf->nr_waiting[idx]);
+ spin_unlock_irq(&conf->resync_lock);
}
-static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
+static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
{
- sector_t sector = 0;
+ int idx = sector_to_idx(sector_nr);
- spin_lock_irq(&conf->resync_lock);
- if (need_to_wait_for_sync(conf, bio)) {
- conf->nr_waiting++;
- /* Wait for the barrier to drop.
- * However if there are already pending
- * requests (preventing the barrier from
- * rising completely), and the
- * per-process bio queue isn't empty,
- * then don't wait, as we need to empty
- * that queue to allow conf->start_next_window
- * to increase.
- */
- raid1_log(conf->mddev, "wait barrier");
- wait_event_lock_irq(conf->wait_barrier,
- !conf->array_frozen &&
- (!conf->barrier ||
- ((conf->start_next_window <
- conf->next_resync + RESYNC_SECTORS) &&
- current->bio_list &&
- !bio_list_empty(current->bio_list))),
- conf->resync_lock);
- conf->nr_waiting--;
- }
-
- if (bio && bio_data_dir(bio) == WRITE) {
- if (bio->bi_iter.bi_sector >= conf->next_resync) {
- if (conf->start_next_window == MaxSector)
- conf->start_next_window =
- conf->next_resync +
- NEXT_NORMALIO_DISTANCE;
-
- if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
- <= bio->bi_iter.bi_sector)
- conf->next_window_requests++;
- else
- conf->current_window_requests++;
- sector = conf->start_next_window;
- }
- }
+ /*
+ * Very similar to _wait_barrier(). The difference is, for read
+ * I/O we don't need to wait for sync I/O, but if the whole array
+ * is frozen, the read I/O still has to wait until the array is
+ * unfrozen. Since there is no ordering requirement with
+ * conf->barrier[idx] here, memory barrier is unnecessary as well.
+ */
+ atomic_inc(&conf->nr_pending[idx]);
+
+ if (!READ_ONCE(conf->array_frozen))
+ return;
- conf->nr_pending++;
+ spin_lock_irq(&conf->resync_lock);
+ atomic_inc(&conf->nr_waiting[idx]);
+ atomic_dec(&conf->nr_pending[idx]);
+ /*
+ * In case freeze_array() is waiting for
+ * get_unqueued_pending() == extra
+ */
+ wake_up(&conf->wait_barrier);
+ /* Wait for array to be unfrozen */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->array_frozen,
+ conf->resync_lock);
+ atomic_inc(&conf->nr_pending[idx]);
+ atomic_dec(&conf->nr_waiting[idx]);
spin_unlock_irq(&conf->resync_lock);
- return sector;
}
-static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
- sector_t bi_sector)
+static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
{
- unsigned long flags;
+ int idx = sector_to_idx(sector_nr);
- spin_lock_irqsave(&conf->resync_lock, flags);
- conf->nr_pending--;
- if (start_next_window) {
- if (start_next_window == conf->start_next_window) {
- if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
- <= bi_sector)
- conf->next_window_requests--;
- else
- conf->current_window_requests--;
- } else
- conf->current_window_requests--;
-
- if (!conf->current_window_requests) {
- if (conf->next_window_requests) {
- conf->current_window_requests =
- conf->next_window_requests;
- conf->next_window_requests = 0;
- conf->start_next_window +=
- NEXT_NORMALIO_DISTANCE;
- } else
- conf->start_next_window = MaxSector;
- }
- }
- spin_unlock_irqrestore(&conf->resync_lock, flags);
+ _wait_barrier(conf, idx);
+}
+
+static void wait_all_barriers(struct r1conf *conf)
+{
+ int idx;
+
+ for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+ _wait_barrier(conf, idx);
+}
+
+static void _allow_barrier(struct r1conf *conf, int idx)
+{
+ atomic_dec(&conf->nr_pending[idx]);
wake_up(&conf->wait_barrier);
}
+/*
+ * Drop one pending count on the barrier bucket that @sector_nr maps to
+ * and wake up waiters (see _allow_barrier()).
+ */
+static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
+{
+	_allow_barrier(conf, sector_to_idx(sector_nr));
+}
+
+/* Call _allow_barrier() once for every barrier bucket. */
+static void allow_all_barriers(struct r1conf *conf)
+{
+	int bucket = 0;
+
+	while (bucket < BARRIER_BUCKETS_NR) {
+		_allow_barrier(conf, bucket);
+		bucket++;
+	}
+}
+
+/*
+ * Return nr_sync_pending plus, summed over all barrier buckets,
+ * nr_pending[idx] - nr_queued[idx].
+ *
+ * conf->resync_lock should be held.
+ */
+static int get_unqueued_pending(struct r1conf *conf)
+{
+	int total = atomic_read(&conf->nr_sync_pending);
+	int idx = 0;
+
+	while (idx < BARRIER_BUCKETS_NR) {
+		total += atomic_read(&conf->nr_pending[idx]) -
+			 atomic_read(&conf->nr_queued[idx]);
+		idx++;
+	}
+
+	return total;
+}
+
static void freeze_array(struct r1conf *conf, int extra)
{
- /* stop syncio and normal IO and wait for everything to
- * go quite.
- * We wait until nr_pending match nr_queued+extra
- * This is called in the context of one normal IO request
- * that has failed. Thus any sync request that might be pending
- * will be blocked by nr_pending, and we need to wait for
- * pending IO requests to complete or be queued for re-try.
- * Thus the number queued (nr_queued) plus this request (extra)
- * must match the number of pending IOs (nr_pending) before
- * we continue.
+ /* Stop sync I/O and normal I/O and wait for everything to
+ * go quiet.
+ * This is called in two situations:
+ * 1) management command handlers (reshape, remove disk, quiesce).
+ * 2) one normal I/O request failed.
+ *
+ * After array_frozen is set to 1, new sync IO will be blocked at
+ * raise_barrier(), and new normal I/O will be blocked at _wait_barrier()
+ * or wait_read_barrier(). The flying I/Os will either complete or be
+ * queued. When everything goes quiet, there are only queued I/Os left.
+ *
+ * Every flying I/O contributes to a conf->nr_pending[idx], idx is the
+ * barrier bucket index which this I/O request hits. When all sync and
+ * normal I/O are queued, sum of all conf->nr_pending[] will match sum
+ * of all conf->nr_queued[]. But normal I/O failure is an exception,
+ * in handle_read_error(), we may call freeze_array() before trying to
+ * fix the read error. In this case, the error read I/O is not queued,
+ * so get_unqueued_pending() == 1.
+ *
+ * Therefore before this function returns, we need to wait until
+ * get_unqueued_pending(conf) becomes equal to extra. For
+ * normal I/O context, extra is 1, in other situations extra is 0.
*/
spin_lock_irq(&conf->resync_lock);
conf->array_frozen = 1;
raid1_log(conf->mddev, "wait freeze");
- wait_event_lock_irq_cmd(conf->wait_barrier,
- conf->nr_pending == conf->nr_queued+extra,
- conf->resync_lock,
- flush_pending_writes(conf));
+ wait_event_lock_irq_cmd(
+ conf->wait_barrier,
+ get_unqueued_pending(conf) == extra,
+ conf->resync_lock,
+ flush_pending_writes(conf));
spin_unlock_irq(&conf->resync_lock);
}
static void unfreeze_array(struct r1conf *conf)
@@ -982,43 +1084,53 @@ static void unfreeze_array(struct r1conf *conf)
/* reverse the effect of the freeze */
spin_lock_irq(&conf->resync_lock);
conf->array_frozen = 0;
- wake_up(&conf->wait_barrier);
spin_unlock_irq(&conf->resync_lock);
+ wake_up(&conf->wait_barrier);
}
-/* duplicate the data pages for behind I/O
- */
-static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
+/*
+ * Build a private copy of @bio for write-behind I/O.
+ *
+ * A new bio is allocated with freshly allocated pages covering
+ * bio->bi_iter.bi_size bytes and the data of @bio is copied into it
+ * (the copy is skipped when @bio carries no data, e.g. discard).
+ * On success the copy is stored in r1_bio->behind_master_bio,
+ * R1BIO_BehindIO is set and the copy is returned.
+ *
+ * On any allocation failure nothing is left allocated, r1_bio is not
+ * modified and NULL is returned; the caller is then expected to do
+ * sync I/O instead (see the pr_debug message below).
+ */
+static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio,
+ struct bio *bio)
{
- int i;
- struct bio_vec *bvec;
- struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
- GFP_NOIO);
- if (unlikely(!bvecs))
- return;
+ int size = bio->bi_iter.bi_size;
+ unsigned vcnt = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ int i = 0;
+ struct bio *behind_bio = NULL;
+
+ behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev);
+ if (!behind_bio)
+ goto fail;
+
+ /* discard op, we don't support writezero/writesame yet */
+ if (!bio_has_data(bio))
+ goto skip_copy;
+
+ while (i < vcnt && size) {
+ struct page *page;
+ int len = min_t(int, PAGE_SIZE, size);
+
+ page = alloc_page(GFP_NOIO);
+ if (unlikely(!page))
+ goto free_pages;
- bio_for_each_segment_all(bvec, bio, i) {
- bvecs[i] = *bvec;
- bvecs[i].bv_page = alloc_page(GFP_NOIO);
- if (unlikely(!bvecs[i].bv_page))
- goto do_sync_io;
- memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
- kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
- kunmap(bvecs[i].bv_page);
- kunmap(bvec->bv_page);
- }
- r1_bio->behind_bvecs = bvecs;
- r1_bio->behind_page_count = bio->bi_vcnt;
+ bio_add_page(behind_bio, page, len, 0);
+
+ size -= len;
+ i++;
+ }
+
+ bio_copy_data(behind_bio, bio);
+skip_copy:
+ r1_bio->behind_master_bio = behind_bio;
set_bit(R1BIO_BehindIO, &r1_bio->state);
- return;
-do_sync_io:
- for (i = 0; i < bio->bi_vcnt; i++)
- if (bvecs[i].bv_page)
- put_page(bvecs[i].bv_page);
- kfree(bvecs);
+ return behind_bio;
+
+free_pages:
pr_debug("%dB behind alloc failed, doing sync I/O\n",
bio->bi_iter.bi_size);
+ /* release the pages added so far, then the bio itself */
+ bio_free_pages(behind_bio);
+ bio_put(behind_bio);
+fail:
+ /* never hand back a bio whose pages have been freed */
+ return NULL;
}
struct raid1_plug_cb {
@@ -1048,30 +1160,34 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
/* we aren't scheduling, so we can do the write-out directly. */
bio = bio_list_get(&plug->pending);
- bitmap_unplug(mddev->bitmap);
- wake_up(&conf->wait_barrier);
-
- while (bio) { /* submit pending writes */
- struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void*)bio->bi_bdev;
- bio->bi_next = NULL;
- bio->bi_bdev = rdev->bdev;
- if (test_bit(Faulty, &rdev->flags)) {
- bio->bi_error = -EIO;
- bio_endio(bio);
- } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
- /* Just ignore it */
- bio_endio(bio);
- else
- generic_make_request(bio);
- bio = next;
- }
+ flush_bio_list(conf, bio);
kfree(plug);
}
+/*
+ * (Re)initialise @r1_bio for handling @bio on @mddev: record the master
+ * bio and owning mddev, take the start sector and length from @bio, and
+ * clear the state flags.
+ */
+static void init_r1bio(struct r1bio *r1_bio, struct mddev *mddev, struct bio *bio)
+{
+	r1_bio->mddev = mddev;
+	r1_bio->master_bio = bio;
+	r1_bio->state = 0;
+
+	/* range covered by this request, taken from the master bio */
+	r1_bio->sector = bio->bi_iter.bi_sector;
+	r1_bio->sectors = bio_sectors(bio);
+}
+
+/*
+ * Take an r1bio from conf->r1bio_pool (GFP_NOIO), clear its bios[] slots
+ * for conf->raid_disks entries and initialise it for @bio via init_r1bio().
+ */
+static inline struct r1bio *
+alloc_r1bio(struct mddev *mddev, struct bio *bio)
+{
+	struct r1conf *conf = mddev->private;
+	struct r1bio *new_r1bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+	/* Ensure no bio records IO_BLOCKED */
+	memset(new_r1bio->bios, 0,
+	       conf->raid_disks * sizeof(new_r1bio->bios[0]));
+
+	init_r1bio(new_r1bio, mddev, bio);
+
+	return new_r1bio;
+}
+
static void raid1_read_request(struct mddev *mddev, struct bio *bio,
- struct r1bio *r1_bio)
+ int max_read_sectors, struct r1bio *r1_bio)
{
struct r1conf *conf = mddev->private;
struct raid1_info *mirror;
@@ -1079,22 +1195,67 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
struct bitmap *bitmap = mddev->bitmap;
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
- int sectors_handled;
int max_sectors;
int rdisk;
+ bool print_msg = !!r1_bio;
+ char b[BDEVNAME_SIZE];
+
+ /*
+ * If r1_bio is set, we are blocking the raid1d thread
+ * so there is a tiny risk of deadlock. So ask for
+ * emergency memory if needed.
+ */
+ gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO;
+
+ if (print_msg) {
+ /* Need to get the block device name carefully */
+ struct md_rdev *rdev;
+ rcu_read_lock();
+ rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
+ if (rdev)
+ bdevname(rdev->bdev, b);
+ else
+ strcpy(b, "???");
+ rcu_read_unlock();
+ }
- wait_barrier(conf, bio);
+ /*
+ * Still need barrier for READ in case that whole
+ * array is frozen.
+ */
+ wait_read_barrier(conf, bio->bi_iter.bi_sector);
-read_again:
+ if (!r1_bio)
+ r1_bio = alloc_r1bio(mddev, bio);
+ else
+ init_r1bio(r1_bio, mddev, bio);
+ r1_bio->sectors = max_read_sectors;
+
+ /*
+ * make_request() can abort the operation when read-ahead is being
+ * used and no empty request is available.
+ */
rdisk = read_balance(conf, r1_bio, &max_sectors);
if (rdisk < 0) {
/* couldn't find anywhere to read from */
+ if (print_msg) {
+ pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
+ mdname(mddev),
+ b,
+ (unsigned long long)r1_bio->sector);
+ }
raid_end_bio_io(r1_bio);
return;
}
mirror = conf->mirrors + rdisk;
+ if (print_msg)
+ pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n",
+ mdname(mddev),
+ (unsigned long long)r1_bio->sector,
+ bdevname(mirror->rdev->bdev, b));
+
if (test_bit(WriteMostly, &mirror->rdev->flags) &&
bitmap) {
/*
@@ -1105,12 +1266,20 @@ read_again:
wait_event(bitmap->behind_wait,
atomic_read(&bitmap->behind_writes) == 0);
}
+
+ if (max_sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, max_sectors,
+ gfp, conf->bio_split);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = max_sectors;
+ }
+
r1_bio->read_disk = rdisk;
- r1_bio->start_next_window = 0;
- read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
+ read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
r1_bio->bios[rdisk] = read_bio;
@@ -1129,58 +1298,22 @@ read_again:
read_bio, disk_devt(mddev->gendisk),
r1_bio->sector);
- if (max_sectors < r1_bio->sectors) {
- /*
- * could not read all from this device, so we will need another
- * r1_bio.
- */
- sectors_handled = (r1_bio->sector + max_sectors
- - bio->bi_iter.bi_sector);
- r1_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (bio->bi_phys_segments == 0)
- bio->bi_phys_segments = 2;
- else
- bio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
-
- /*
- * Cannot call generic_make_request directly as that will be
- * queued in __make_request and subsequent mempool_alloc might
- * block waiting for it. So hand bio over to raid1d.
- */
- reschedule_retry(r1_bio);
-
- r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
- r1_bio->master_bio = bio;
- r1_bio->sectors = bio_sectors(bio) - sectors_handled;
- r1_bio->state = 0;
- r1_bio->mddev = mddev;
- r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
- goto read_again;
- } else
- generic_make_request(read_bio);
+ generic_make_request(read_bio);
}
static void raid1_write_request(struct mddev *mddev, struct bio *bio,
- struct r1bio *r1_bio)
+ int max_write_sectors)
{
struct r1conf *conf = mddev->private;
+ struct r1bio *r1_bio;
int i, disks;
struct bitmap *bitmap = mddev->bitmap;
unsigned long flags;
- const int op = bio_op(bio);
- const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
- const unsigned long do_flush_fua = (bio->bi_opf &
- (REQ_PREFLUSH | REQ_FUA));
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
struct raid1_plug_cb *plug = NULL;
int first_clone;
- int sectors_handled;
int max_sectors;
- sector_t start_next_window;
/*
* Register the new request and wait if the reconstruction
@@ -1188,7 +1321,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
* Continue immediately if no resync is active currently.
*/
- md_write_start(mddev, bio); /* wait on superblock update early */
if ((bio_end_sector(bio) > mddev->suspend_lo &&
bio->bi_iter.bi_sector < mddev->suspend_hi) ||
@@ -1202,7 +1334,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
*/
DEFINE_WAIT(w);
for (;;) {
- flush_signals(current);
+ sigset_t full, old;
prepare_to_wait(&conf->wait_barrier,
&w, TASK_INTERRUPTIBLE);
if (bio_end_sector(bio) <= mddev->suspend_lo ||
@@ -1212,11 +1344,17 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
bio->bi_iter.bi_sector,
bio_end_sector(bio))))
break;
+ sigfillset(&full);
+ sigprocmask(SIG_BLOCK, &full, &old);
schedule();
+ sigprocmask(SIG_SETMASK, &old, NULL);
}
finish_wait(&conf->wait_barrier, &w);
}
- start_next_window = wait_barrier(conf, bio);
+ wait_barrier(conf, bio->bi_iter.bi_sector);
+
+ r1_bio = alloc_r1bio(mddev, bio);
+ r1_bio->sectors = max_write_sectors;
if (conf->pending_count >= max_queued_requests) {
md_wakeup_thread(mddev->thread);
@@ -1237,7 +1375,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
disks = conf->raid_disks * 2;
retry_write:
- r1_bio->start_next_window = start_next_window;
blocked_rdev = NULL;
rcu_read_lock();
max_sectors = r1_bio->sectors;
@@ -1304,54 +1441,38 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (unlikely(blocked_rdev)) {
/* Wait for this device to become unblocked */
int j;
- sector_t old = start_next_window;
for (j = 0; j < i; j++)
if (r1_bio->bios[j])
rdev_dec_pending(conf->mirrors[j].rdev, mddev);
r1_bio->state = 0;
- allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
+ allow_barrier(conf, bio->bi_iter.bi_sector);
raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
- start_next_window = wait_barrier(conf, bio);
- /*
- * We must make sure the multi r1bios of bio have
- * the same value of bi_phys_segments
- */
- if (bio->bi_phys_segments && old &&
- old != start_next_window)
- /* Wait for the former r1bio(s) to complete */
- wait_event(conf->wait_barrier,
- bio->bi_phys_segments == 1);
+ wait_barrier(conf, bio->bi_iter.bi_sector);
goto retry_write;
}
- if (max_sectors < r1_bio->sectors) {
- /* We are splitting this write into multiple parts, so
- * we need to prepare for allocating another r1_bio.
- */
+ if (max_sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, max_sectors,
+ GFP_NOIO, conf->bio_split);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ r1_bio->master_bio = bio;
r1_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (bio->bi_phys_segments == 0)
- bio->bi_phys_segments = 2;
- else
- bio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
}
- sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);
first_clone = 1;
+
for (i = 0; i < disks; i++) {
- struct bio *mbio;
+ struct bio *mbio = NULL;
if (!r1_bio->bios[i])
continue;
- mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
if (first_clone) {
/* do behind I/O ?
@@ -1361,8 +1482,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (bitmap &&
(atomic_read(&bitmap->behind_writes)
< mddev->bitmap_info.max_write_behind) &&
- !waitqueue_active(&bitmap->behind_wait))
- alloc_behind_pages(mbio, r1_bio);
+ !waitqueue_active(&bitmap->behind_wait)) {
+ mbio = alloc_behind_master_bio(r1_bio, bio);
+ }
bitmap_startwrite(bitmap, r1_bio->sector,
r1_bio->sectors,
@@ -1370,15 +1492,17 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
&r1_bio->state));
first_clone = 0;
}
- if (r1_bio->behind_bvecs) {
- struct bio_vec *bvec;
- int j;
- /*
- * We trimmed the bio, so _all is legit
- */
- bio_for_each_segment_all(bvec, mbio, j)
- bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
+ if (!mbio) {
+ if (r1_bio->behind_master_bio)
+ mbio = bio_clone_fast(r1_bio->behind_master_bio,
+ GFP_NOIO,
+ mddev->bio_set);
+ else
+ mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ }
+
+ if (r1_bio->behind_master_bio) {
if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
}
@@ -1389,7 +1513,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
conf->mirrors[i].rdev->data_offset);
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request;
- bio_set_op_attrs(mbio, op, do_flush_fua | do_sync);
+ mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) &&
!test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) &&
conf->raid_disks - mddev->degraded > 1)
@@ -1410,33 +1534,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
plug = container_of(cb, struct raid1_plug_cb, cb);
else
plug = NULL;
- spin_lock_irqsave(&conf->device_lock, flags);
if (plug) {
bio_list_add(&plug->pending, mbio);
plug->pending_cnt++;
} else {
+ spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++;
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (!plug)
+ spin_unlock_irqrestore(&conf->device_lock, flags);
md_wakeup_thread(mddev->thread);
- }
- /* Mustn't call r1_bio_write_done before this next test,
- * as it could result in the bio being freed.
- */
- if (sectors_handled < bio_sectors(bio)) {
- r1_bio_write_done(r1_bio);
- /* We need another r1_bio. It has already been counted
- * in bio->bi_phys_segments
- */
- r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
- r1_bio->master_bio = bio;
- r1_bio->sectors = bio_sectors(bio) - sectors_handled;
- r1_bio->state = 0;
- r1_bio->mddev = mddev;
- r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
- goto retry_write;
+ }
}
r1_bio_write_done(r1_bio);
@@ -1445,38 +1552,33 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
wake_up(&conf->wait_barrier);
}
-static void raid1_make_request(struct mddev *mddev, struct bio *bio)
+static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
{
- struct r1conf *conf = mddev->private;
- struct r1bio *r1_bio;
-
- /*
- * make_request() can abort the operation when read-ahead is being
- * used and no empty request is available.
- *
- */
- r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+ sector_t sectors;
- r1_bio->master_bio = bio;
- r1_bio->sectors = bio_sectors(bio);
- r1_bio->state = 0;
- r1_bio->mddev = mddev;
- r1_bio->sector = bio->bi_iter.bi_sector;
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
+ md_flush_request(mddev, bio);
+ return true;
+ }
/*
- * We might need to issue multiple reads to different devices if there
- * are bad blocks around, so we keep track of the number of reads in
- * bio->bi_phys_segments. If this is 0, there is only one r1_bio and
- * no locking will be needed when requests complete. If it is
- * non-zero, then it is the number of not-completed requests.
+ * There is a limit to the maximum size, but
+ * the read/write handler might find a lower limit
+ * due to bad blocks. To avoid multiple splits,
+ * we pass the maximum number of sectors down
+ * and let the lower level perform the split.
*/
- bio->bi_phys_segments = 0;
- bio_clear_flag(bio, BIO_SEG_VALID);
+ sectors = align_to_barrier_unit_end(
+ bio->bi_iter.bi_sector, bio_sectors(bio));
if (bio_data_dir(bio) == READ)
- raid1_read_request(mddev, bio, r1_bio);
- else
- raid1_write_request(mddev, bio, r1_bio);
+ raid1_read_request(mddev, bio, sectors, NULL);
+ else {
+ if (!md_write_start(mddev,bio))
+ return false;
+ raid1_write_request(mddev, bio, sectors);
+ }
+ return true;
}
static void raid1_status(struct seq_file *seq, struct mddev *mddev)
@@ -1567,19 +1669,11 @@ static void print_conf(struct r1conf *conf)
static void close_sync(struct r1conf *conf)
{
- wait_barrier(conf, NULL);
- allow_barrier(conf, 0, 0);
+ wait_all_barriers(conf);
+ allow_all_barriers(conf);
mempool_destroy(conf->r1buf_pool);
conf->r1buf_pool = NULL;
-
- spin_lock_irq(&conf->resync_lock);
- conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE;
- conf->start_next_window = MaxSector;
- conf->current_window_requests +=
- conf->next_window_requests;
- conf->next_window_requests = 0;
- spin_unlock_irq(&conf->resync_lock);
}
static int raid1_spare_active(struct mddev *mddev)
@@ -1746,9 +1840,9 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
p->rdev = repl;
conf->mirrors[conf->raid_disks + number].rdev = NULL;
unfreeze_array(conf);
- clear_bit(WantReplacement, &rdev->flags);
- } else
- clear_bit(WantReplacement, &rdev->flags);
+ }
+
+ clear_bit(WantReplacement, &rdev->flags);
err = md_integrity_register(mddev);
}
abort:
@@ -1759,7 +1853,7 @@ abort:
static void end_sync_read(struct bio *bio)
{
- struct r1bio *r1_bio = bio->bi_private;
+ struct r1bio *r1_bio = get_resync_r1bio(bio);
update_head_pos(r1_bio->read_disk, r1_bio);
@@ -1768,7 +1862,7 @@ static void end_sync_read(struct bio *bio)
* or re-read if the read failed.
* We don't do much here, just schedule handling by raid1d
*/
- if (!bio->bi_error)
+ if (!bio->bi_status)
set_bit(R1BIO_Uptodate, &r1_bio->state);
if (atomic_dec_and_test(&r1_bio->remaining))
@@ -1777,8 +1871,8 @@ static void end_sync_read(struct bio *bio)
static void end_sync_write(struct bio *bio)
{
- int uptodate = !bio->bi_error;
- struct r1bio *r1_bio = bio->bi_private;
+ int uptodate = !bio->bi_status;
+ struct r1bio *r1_bio = get_resync_r1bio(bio);
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
sector_t first_bad;
@@ -1857,6 +1951,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
struct bio *bio = r1_bio->bios[r1_bio->read_disk];
+ struct page **pages = get_resync_pages(bio)->pages;
sector_t sect = r1_bio->sector;
int sectors = r1_bio->sectors;
int idx = 0;
@@ -1890,7 +1985,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
*/
rdev = conf->mirrors[d].rdev;
if (sync_page_io(rdev, sect, s<<9,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
REQ_OP_READ, 0, false)) {
success = 1;
break;
@@ -1945,7 +2040,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
continue;
rdev = conf->mirrors[d].rdev;
if (r1_sync_page_io(rdev, sect, s,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
WRITE) == 0) {
r1_bio->bios[d]->bi_end_io = NULL;
rdev_dec_pending(rdev, mddev);
@@ -1960,7 +2055,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
continue;
rdev = conf->mirrors[d].rdev;
if (r1_sync_page_io(rdev, sect, s,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
READ) != 0)
atomic_add(s, &rdev->corrected_errors);
}
@@ -1969,7 +2064,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
idx ++;
}
set_bit(R1BIO_Uptodate, &r1_bio->state);
- bio->bi_error = 0;
+ bio->bi_status = 0;
return 1;
}
@@ -1993,26 +2088,27 @@ static void process_checks(struct r1bio *r1_bio)
for (i = 0; i < conf->raid_disks * 2; i++) {
int j;
int size;
- int error;
+ blk_status_t status;
+ struct bio_vec *bi;
struct bio *b = r1_bio->bios[i];
+ struct resync_pages *rp = get_resync_pages(b);
if (b->bi_end_io != end_sync_read)
continue;
/* fixup the bio for reuse, but preserve errno */
- error = b->bi_error;
+ status = b->bi_status;
bio_reset(b);
- b->bi_error = error;
+ b->bi_status = status;
b->bi_vcnt = vcnt;
b->bi_iter.bi_size = r1_bio->sectors << 9;
b->bi_iter.bi_sector = r1_bio->sector +
conf->mirrors[i].rdev->data_offset;
b->bi_bdev = conf->mirrors[i].rdev->bdev;
b->bi_end_io = end_sync_read;
- b->bi_private = r1_bio;
+ rp->raid_bio = r1_bio;
+ b->bi_private = rp;
size = b->bi_iter.bi_size;
- for (j = 0; j < vcnt ; j++) {
- struct bio_vec *bi;
- bi = &b->bi_io_vec[j];
+ bio_for_each_segment_all(bi, b, j) {
bi->bv_offset = 0;
if (size > PAGE_SIZE)
bi->bv_len = PAGE_SIZE;
@@ -2023,7 +2119,7 @@ static void process_checks(struct r1bio *r1_bio)
}
for (primary = 0; primary < conf->raid_disks * 2; primary++)
if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
- !r1_bio->bios[primary]->bi_error) {
+ !r1_bio->bios[primary]->bi_status) {
r1_bio->bios[primary]->bi_end_io = NULL;
rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
break;
@@ -2033,21 +2129,25 @@ static void process_checks(struct r1bio *r1_bio)
int j;
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
- int error = sbio->bi_error;
+ blk_status_t status = sbio->bi_status;
+ struct page **ppages = get_resync_pages(pbio)->pages;
+ struct page **spages = get_resync_pages(sbio)->pages;
+ struct bio_vec *bi;
+ int page_len[RESYNC_PAGES] = { 0 };
if (sbio->bi_end_io != end_sync_read)
continue;
/* Now we can 'fixup' the error value */
- sbio->bi_error = 0;
+ sbio->bi_status = 0;
+
+ bio_for_each_segment_all(bi, sbio, j)
+ page_len[j] = bi->bv_len;
- if (!error) {
+ if (!status) {
for (j = vcnt; j-- ; ) {
- struct page *p, *s;
- p = pbio->bi_io_vec[j].bv_page;
- s = sbio->bi_io_vec[j].bv_page;
- if (memcmp(page_address(p),
- page_address(s),
- sbio->bi_io_vec[j].bv_len))
+ if (memcmp(page_address(ppages[j]),
+ page_address(spages[j]),
+ page_len[j]))
break;
}
} else
@@ -2055,7 +2155,7 @@ static void process_checks(struct r1bio *r1_bio)
if (j >= 0)
atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
- && !error)) {
+ && !status)) {
/* No need to write to this device. */
sbio->bi_end_io = NULL;
rdev_dec_pending(conf->mirrors[i].rdev, mddev);
@@ -2071,9 +2171,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
struct r1conf *conf = mddev->private;
int i;
int disks = conf->raid_disks * 2;
- struct bio *bio, *wbio;
-
- bio = r1_bio->bios[r1_bio->read_disk];
+ struct bio *wbio;
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
/* ouch - failed to read all of that. */
@@ -2094,6 +2192,8 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
(i == r1_bio->read_disk ||
!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
continue;
+ if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
+ continue;
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
@@ -2263,20 +2363,14 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
/* Write at 'sector' for 'sectors'*/
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
- unsigned vcnt = r1_bio->behind_page_count;
- struct bio_vec *vec = r1_bio->behind_bvecs;
-
- while (!vec->bv_page) {
- vec++;
- vcnt--;
- }
-
- wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
- memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
-
- wbio->bi_vcnt = vcnt;
+ wbio = bio_clone_fast(r1_bio->behind_master_bio,
+ GFP_NOIO,
+ mddev->bio_set);
+ /* We really need a _all clone */
+ wbio->bi_iter = (struct bvec_iter){ 0 };
} else {
- wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+ wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
+ mddev->bio_set);
}
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
@@ -2310,11 +2404,11 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
struct bio *bio = r1_bio->bios[m];
if (bio->bi_end_io == NULL)
continue;
- if (!bio->bi_error &&
+ if (!bio->bi_status &&
test_bit(R1BIO_MadeGood, &r1_bio->state)) {
rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
}
- if (bio->bi_error &&
+ if (bio->bi_status &&
test_bit(R1BIO_WriteError, &r1_bio->state)) {
if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
md_error(conf->mddev, rdev);
@@ -2326,8 +2420,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
{
- int m;
+ int m, idx;
bool fail = false;
+
for (m = 0; m < conf->raid_disks * 2 ; m++)
if (r1_bio->bios[m] == IO_MADE_GOOD) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2353,8 +2448,14 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
if (fail) {
spin_lock_irq(&conf->device_lock);
list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
- conf->nr_queued++;
+ idx = sector_to_idx(r1_bio->sector);
+ atomic_inc(&conf->nr_queued[idx]);
spin_unlock_irq(&conf->device_lock);
+ /*
+ * In case freeze_array() is waiting for condition
+ * get_unqueued_pending() == extra to be true.
+ */
+ wake_up(&conf->wait_barrier);
md_wakeup_thread(conf->mddev->thread);
} else {
if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2365,11 +2466,8 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
{
- int disk;
- int max_sectors;
struct mddev *mddev = conf->mddev;
struct bio *bio;
- char b[BDEVNAME_SIZE];
struct md_rdev *rdev;
dev_t bio_dev;
sector_t bio_sector;
@@ -2385,7 +2483,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
*/
bio = r1_bio->bios[r1_bio->read_disk];
- bdevname(bio->bi_bdev, b);
bio_dev = bio->bi_bdev->bd_dev;
bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector;
bio_put(bio);
@@ -2403,68 +2500,12 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
}
rdev_dec_pending(rdev, conf->mddev);
+ allow_barrier(conf, r1_bio->sector);
+ bio = r1_bio->master_bio;
-read_more:
- disk = read_balance(conf, r1_bio, &max_sectors);
- if (disk == -1) {
- pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
- mdname(mddev), b, (unsigned long long)r1_bio->sector);
- raid_end_bio_io(r1_bio);
- } else {
- const unsigned long do_sync
- = r1_bio->master_bio->bi_opf & REQ_SYNC;
- r1_bio->read_disk = disk;
- bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
- bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
- r1_bio->bios[r1_bio->read_disk] = bio;
- rdev = conf->mirrors[disk].rdev;
- pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n",
- mdname(mddev),
- (unsigned long long)r1_bio->sector,
- bdevname(rdev->bdev, b));
- bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset;
- bio->bi_bdev = rdev->bdev;
- bio->bi_end_io = raid1_end_read_request;
- bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
- if (test_bit(FailFast, &rdev->flags) &&
- test_bit(R1BIO_FailFast, &r1_bio->state))
- bio->bi_opf |= MD_FAILFAST;
- bio->bi_private = r1_bio;
- if (max_sectors < r1_bio->sectors) {
- /* Drat - have to split this up more */
- struct bio *mbio = r1_bio->master_bio;
- int sectors_handled = (r1_bio->sector + max_sectors
- - mbio->bi_iter.bi_sector);
- r1_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (mbio->bi_phys_segments == 0)
- mbio->bi_phys_segments = 2;
- else
- mbio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
- trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
- bio, bio_dev, bio_sector);
- generic_make_request(bio);
- bio = NULL;
-
- r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
- r1_bio->master_bio = mbio;
- r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
- r1_bio->state = 0;
- set_bit(R1BIO_ReadError, &r1_bio->state);
- r1_bio->mddev = mddev;
- r1_bio->sector = mbio->bi_iter.bi_sector +
- sectors_handled;
-
- goto read_more;
- } else {
- trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
- bio, bio_dev, bio_sector);
- generic_make_request(bio);
- }
- }
+ /* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */
+ r1_bio->state = 0;
+ raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio);
}
static void raid1d(struct md_thread *thread)
@@ -2475,6 +2516,7 @@ static void raid1d(struct md_thread *thread)
struct r1conf *conf = mddev->private;
struct list_head *head = &conf->retry_list;
struct blk_plug plug;
+ int idx;
md_check_recovery(mddev);
@@ -2482,17 +2524,15 @@ static void raid1d(struct md_thread *thread)
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
LIST_HEAD(tmp);
spin_lock_irqsave(&conf->device_lock, flags);
- if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
- while (!list_empty(&conf->bio_end_io_list)) {
- list_move(conf->bio_end_io_list.prev, &tmp);
- conf->nr_queued--;
- }
- }
+ if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
+ list_splice_init(&conf->bio_end_io_list, &tmp);
spin_unlock_irqrestore(&conf->device_lock, flags);
while (!list_empty(&tmp)) {
r1_bio = list_first_entry(&tmp, struct r1bio,
retry_list);
list_del(&r1_bio->retry_list);
+ idx = sector_to_idx(r1_bio->sector);
+ atomic_dec(&conf->nr_queued[idx]);
if (mddev->degraded)
set_bit(R1BIO_Degraded, &r1_bio->state);
if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2513,7 +2553,8 @@ static void raid1d(struct md_thread *thread)
}
r1_bio = list_entry(head->prev, struct r1bio, retry_list);
list_del(head->prev);
- conf->nr_queued--;
+ idx = sector_to_idx(r1_bio->sector);
+ atomic_dec(&conf->nr_queued[idx]);
spin_unlock_irqrestore(&conf->device_lock, flags);
mddev = r1_bio->mddev;
@@ -2530,10 +2571,7 @@ static void raid1d(struct md_thread *thread)
else if (test_bit(R1BIO_ReadError, &r1_bio->state))
handle_read_error(conf, r1_bio);
else
- /* just a partial read to be scheduled from separate
- * context
- */
- generic_make_request(r1_bio->bios[r1_bio->read_disk]);
+ WARN_ON_ONCE(1);
cond_resched();
if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
@@ -2552,7 +2590,6 @@ static int init_resync(struct r1conf *conf)
conf->poolinfo);
if (!conf->r1buf_pool)
return -ENOMEM;
- conf->next_resync = 0;
return 0;
}
@@ -2581,6 +2618,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
int still_degraded = 0;
int good_sectors = RESYNC_SECTORS;
int min_bad = 0; /* number of sectors that are bad in all devices */
+ int idx = sector_to_idx(sector_nr);
if (!conf->r1buf_pool)
if (init_resync(conf))
@@ -2630,7 +2668,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* If there is non-resync activity waiting for a turn, then let it
* though before starting on this new sync request.
*/
- if (conf->nr_waiting)
+ if (atomic_read(&conf->nr_waiting[idx]))
schedule_timeout_uninterruptible(1);
/* we are incrementing sector_nr below. To be safe, we check against
@@ -2657,11 +2695,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
r1_bio->sector = sector_nr;
r1_bio->state = 0;
set_bit(R1BIO_IsSync, &r1_bio->state);
+ /* make sure good_sectors won't go across barrier unit boundary */
+ good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
for (i = 0; i < conf->raid_disks * 2; i++) {
struct md_rdev *rdev;
bio = r1_bio->bios[i];
- bio_reset(bio);
rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev == NULL ||
@@ -2717,7 +2756,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
atomic_inc(&rdev->nr_pending);
bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
bio->bi_bdev = rdev->bdev;
- bio->bi_private = r1_bio;
if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
}
@@ -2803,31 +2841,25 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
}
for (i = 0 ; i < conf->raid_disks * 2; i++) {
+ struct resync_pages *rp;
+
bio = r1_bio->bios[i];
+ rp = get_resync_pages(bio);
if (bio->bi_end_io) {
- page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
- if (bio_add_page(bio, page, len, 0) == 0) {
- /* stop here */
- bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
- while (i > 0) {
- i--;
- bio = r1_bio->bios[i];
- if (bio->bi_end_io==NULL)
- continue;
- /* remove last page from this bio */
- bio->bi_vcnt--;
- bio->bi_iter.bi_size -= len;
- bio_clear_flag(bio, BIO_SEG_VALID);
- }
- goto bio_full;
- }
+ page = resync_fetch_page(rp, rp->idx++);
+
+ /*
+ * won't fail because the vec table is big
+ * enough to hold all these pages
+ */
+ bio_add_page(bio, page, len, 0);
}
}
nr_sectors += len>>9;
sector_nr += len>>9;
sync_blocks -= (len>>9);
- } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
- bio_full:
+ } while (get_resync_pages(r1_bio->bios[disk])->idx < RESYNC_PAGES);
+
r1_bio->sectors = nr_sectors;
if (mddev_is_clustered(mddev) &&
@@ -2887,6 +2919,26 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf)
goto abort;
+ conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
+ sizeof(atomic_t), GFP_KERNEL);
+ if (!conf->nr_pending)
+ goto abort;
+
+ conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
+ sizeof(atomic_t), GFP_KERNEL);
+ if (!conf->nr_waiting)
+ goto abort;
+
+ conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
+ sizeof(atomic_t), GFP_KERNEL);
+ if (!conf->nr_queued)
+ goto abort;
+
+ conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
+ sizeof(atomic_t), GFP_KERNEL);
+ if (!conf->barrier)
+ goto abort;
+
conf->mirrors = kzalloc(sizeof(struct raid1_info)
* mddev->raid_disks * 2,
GFP_KERNEL);
@@ -2907,12 +2959,15 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf->r1bio_pool)
goto abort;
+ conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
+ if (!conf->bio_split)
+ goto abort;
+
conf->poolinfo->mddev = mddev;
err = -EINVAL;
spin_lock_init(&conf->device_lock);
rdev_for_each(rdev, mddev) {
- struct request_queue *q;
int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
@@ -2925,8 +2980,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (disk->rdev)
goto abort;
disk->rdev = rdev;
- q = bdev_get_queue(rdev->bdev);
-
disk->head_position = 0;
disk->seq_start = MaxSector;
}
@@ -2942,9 +2995,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->pending_count = 0;
conf->recovery_disabled = mddev->recovery_disabled - 1;
- conf->start_next_window = MaxSector;
- conf->current_window_requests = conf->next_window_requests = 0;
-
err = -EIO;
for (i = 0; i < conf->raid_disks * 2; i++) {
@@ -2987,6 +3037,12 @@ static struct r1conf *setup_conf(struct mddev *mddev)
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
kfree(conf->poolinfo);
+ kfree(conf->nr_pending);
+ kfree(conf->nr_waiting);
+ kfree(conf->nr_queued);
+ kfree(conf->barrier);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf);
}
return ERR_PTR(err);
@@ -3011,6 +3067,8 @@ static int raid1_run(struct mddev *mddev)
mdname(mddev));
return -EIO;
}
+ if (mddev_init_writes_pending(mddev) < 0)
+ return -ENOMEM;
/*
* copy the already verified devices into our private RAID1
* bookkeeping area. [whatever we allocate in run(),
@@ -3024,8 +3082,10 @@ static int raid1_run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
- if (mddev->queue)
+ if (mddev->queue) {
blk_queue_max_write_same_sectors(mddev->queue, 0);
+ blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
+ }
rdev_for_each(rdev, mddev) {
if (!mddev->gendisk)
@@ -3088,6 +3148,12 @@ static void raid1_free(struct mddev *mddev, void *priv)
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
kfree(conf->poolinfo);
+ kfree(conf->nr_pending);
+ kfree(conf->nr_waiting);
+ kfree(conf->nr_queued);
+ kfree(conf->barrier);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf);
}
@@ -3110,8 +3176,6 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
return ret;
}
md_set_array_sectors(mddev, newsize);
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) {
mddev->recovery_cp = mddev->dev_sectors;
@@ -3141,7 +3205,7 @@ static int raid1_reshape(struct mddev *mddev)
struct r1conf *conf = mddev->private;
int cnt, raid_disks;
unsigned long flags;
- int d, d2, err;
+ int d, d2;
/* Cannot change chunk_size, layout, or level */
if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
@@ -3153,11 +3217,8 @@ static int raid1_reshape(struct mddev *mddev)
return -EINVAL;
}
- if (!mddev_is_clustered(mddev)) {
- err = md_allow_write(mddev);
- if (err)
- return err;
- }
+ if (!mddev_is_clustered(mddev))
+ md_allow_write(mddev);
raid_disks = mddev->raid_disks + mddev->delta_disks;
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index c52ef424a24b..c8894ef1e9d2 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -1,6 +1,30 @@
#ifndef _RAID1_H
#define _RAID1_H
+/*
+ * each barrier unit size is 64MB for now
+ * note: it must be larger than RESYNC_DEPTH
+ */
+#define BARRIER_UNIT_SECTOR_BITS 17
+#define BARRIER_UNIT_SECTOR_SIZE (1<<17)
+/*
+ * In struct r1conf, the following members are related to I/O barrier
+ * buckets,
+ * atomic_t *nr_pending;
+ * atomic_t *nr_waiting;
+ * atomic_t *nr_queued;
+ * atomic_t *barrier;
+ * Each of them points to array of atomic_t variables, each array is
+ * designed to have BARRIER_BUCKETS_NR elements and occupy a single
+ * memory page. The data width of atomic_t variables is 4 bytes, equal
+ * to 1<<(ilog2(sizeof(atomic_t))), BARRIER_BUCKETS_NR_BITS is defined
+ * as (PAGE_SHIFT - ilog2(sizeof(atomic_t))) to make sure an array of
+ * atomic_t variables with BARRIER_BUCKETS_NR elements just exactly
+ * occupies a single memory page.
+ */
+#define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - ilog2(sizeof(atomic_t)))
+#define BARRIER_BUCKETS_NR (1<<BARRIER_BUCKETS_NR_BITS)
+
struct raid1_info {
struct md_rdev *rdev;
sector_t head_position;
@@ -35,25 +59,6 @@ struct r1conf {
*/
int raid_disks;
- /* During resync, read_balancing is only allowed on the part
- * of the array that has been resynced. 'next_resync' tells us
- * where that is.
- */
- sector_t next_resync;
-
- /* When raid1 starts resync, we divide array into four partitions
- * |---------|--------------|---------------------|-------------|
- * next_resync start_next_window end_window
- * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
- * end_window = start_next_window + NEXT_NORMALIO_DISTANCE
- * current_window_requests means the count of normalIO between
- * start_next_window and end_window.
- * next_window_requests means the count of normalIO after end_window.
- * */
- sector_t start_next_window;
- int current_window_requests;
- int next_window_requests;
-
spinlock_t device_lock;
/* list of 'struct r1bio' that need to be processed by raid1d,
@@ -79,10 +84,11 @@ struct r1conf {
*/
wait_queue_head_t wait_barrier;
spinlock_t resync_lock;
- int nr_pending;
- int nr_waiting;
- int nr_queued;
- int barrier;
+ atomic_t nr_sync_pending;
+ atomic_t *nr_pending;
+ atomic_t *nr_waiting;
+ atomic_t *nr_queued;
+ atomic_t *barrier;
int array_frozen;
/* Set to 1 if a full sync is needed, (fresh device added).
@@ -102,6 +108,8 @@ struct r1conf {
mempool_t *r1bio_pool;
mempool_t *r1buf_pool;
+ struct bio_set *bio_split;
+
/* temporary buffer to synchronous IO when attempting to repair
* a read error.
*/
@@ -135,7 +143,6 @@ struct r1bio {
* in this BehindIO request
*/
sector_t sector;
- sector_t start_next_window;
int sectors;
unsigned long state;
struct mddev *mddev;
@@ -149,9 +156,13 @@ struct r1bio {
int read_disk;
struct list_head retry_list;
- /* Next two are only valid when R1BIO_BehindIO is set */
- struct bio_vec *behind_bvecs;
- int behind_page_count;
+
+ /*
+ * When R1BIO_BehindIO is set, we store pages for write behind
+ * in behind_master_bio.
+ */
+ struct bio *behind_master_bio;
+
/*
* if the IO is in WRITE direction, then multiple bios are used.
* We choose the number when they are allocated.
@@ -185,4 +196,10 @@ enum r1bio_state {
R1BIO_WriteError,
R1BIO_FailFast,
};
+
+static inline int sector_to_idx(sector_t sector)
+{
+ return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
+ BARRIER_BUCKETS_NR_BITS);
+}
#endif
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1920756828df..5026e7ad51d3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -110,6 +110,24 @@ static void end_reshape(struct r10conf *conf);
#define raid10_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
+/*
+ * 'struct resync_pages' stores the actual pages used for doing the resync
+ * IO, and it is per-bio, so make .bi_private point to it.
+ */
+static inline struct resync_pages *get_resync_pages(struct bio *bio)
+{
+ return bio->bi_private;
+}
+
+/*
+ * for resync bio, r10bio pointer can be retrieved from the per-bio
+ * 'struct resync_pages'.
+ */
+static inline struct r10bio *get_resync_r10bio(struct bio *bio)
+{
+ return get_resync_pages(bio)->raid_bio;
+}
+
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
struct r10conf *conf = data;
@@ -125,9 +143,6 @@ static void r10bio_pool_free(void *r10_bio, void *data)
kfree(r10_bio);
}
-/* Maximum size of each resync request */
-#define RESYNC_BLOCK_SIZE (64*1024)
-#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
@@ -143,11 +158,11 @@ static void r10bio_pool_free(void *r10_bio, void *data)
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
struct r10conf *conf = data;
- struct page *page;
struct r10bio *r10_bio;
struct bio *bio;
- int i, j;
- int nalloc;
+ int j;
+ int nalloc, nalloc_rp;
+ struct resync_pages *rps;
r10_bio = r10bio_pool_alloc(gfp_flags, conf);
if (!r10_bio)
@@ -159,6 +174,15 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
else
nalloc = 2; /* recovery */
+ /* allocate once for all bios */
+ if (!conf->have_replacement)
+ nalloc_rp = nalloc;
+ else
+ nalloc_rp = nalloc * 2;
+ rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags);
+ if (!rps)
+ goto out_free_r10bio;
+
/*
* Allocate bios.
*/
@@ -178,36 +202,40 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
* Allocate RESYNC_PAGES data pages and attach them
* where needed.
*/
- for (j = 0 ; j < nalloc; j++) {
+ for (j = 0; j < nalloc; j++) {
struct bio *rbio = r10_bio->devs[j].repl_bio;
+ struct resync_pages *rp, *rp_repl;
+
+ rp = &rps[j];
+ if (rbio)
+ rp_repl = &rps[nalloc + j];
+
bio = r10_bio->devs[j].bio;
- for (i = 0; i < RESYNC_PAGES; i++) {
- if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
- &conf->mddev->recovery)) {
- /* we can share bv_page's during recovery
- * and reshape */
- struct bio *rbio = r10_bio->devs[0].bio;
- page = rbio->bi_io_vec[i].bv_page;
- get_page(page);
- } else
- page = alloc_page(gfp_flags);
- if (unlikely(!page))
+
+ if (!j || test_bit(MD_RECOVERY_SYNC,
+ &conf->mddev->recovery)) {
+ if (resync_alloc_pages(rp, gfp_flags))
goto out_free_pages;
+ } else {
+ memcpy(rp, &rps[0], sizeof(*rp));
+ resync_get_all_pages(rp);
+ }
- bio->bi_io_vec[i].bv_page = page;
- if (rbio)
- rbio->bi_io_vec[i].bv_page = page;
+ rp->idx = 0;
+ rp->raid_bio = r10_bio;
+ bio->bi_private = rp;
+ if (rbio) {
+ memcpy(rp_repl, rp, sizeof(*rp));
+ rbio->bi_private = rp_repl;
}
}
return r10_bio;
out_free_pages:
- for ( ; i > 0 ; i--)
- safe_put_page(bio->bi_io_vec[i-1].bv_page);
- while (j--)
- for (i = 0; i < RESYNC_PAGES ; i++)
- safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
+ while (--j >= 0) /* device j's pages live in rps[j]; see rp = &rps[j] */
+ resync_free_pages(&rps[j]);
+
j = 0;
out_free_bio:
for ( ; j < nalloc; j++) {
@@ -216,30 +244,34 @@ out_free_bio:
if (r10_bio->devs[j].repl_bio)
bio_put(r10_bio->devs[j].repl_bio);
}
+ kfree(rps);
+out_free_r10bio:
r10bio_pool_free(r10_bio, conf);
return NULL;
}
static void r10buf_pool_free(void *__r10_bio, void *data)
{
- int i;
struct r10conf *conf = data;
struct r10bio *r10bio = __r10_bio;
int j;
+ struct resync_pages *rp = NULL;
- for (j=0; j < conf->copies; j++) {
+ for (j = conf->copies; j--; ) {
struct bio *bio = r10bio->devs[j].bio;
- if (bio) {
- for (i = 0; i < RESYNC_PAGES; i++) {
- safe_put_page(bio->bi_io_vec[i].bv_page);
- bio->bi_io_vec[i].bv_page = NULL;
- }
- bio_put(bio);
- }
+ if (bio) { /* not every slot has a bio: recovery allocates only 2 */
+ rp = get_resync_pages(bio);
+ resync_free_pages(rp);
+ bio_put(bio);
+ }
bio = r10bio->devs[j].repl_bio;
if (bio)
bio_put(bio);
}
+
+ /* resync pages array stored in the 1st bio's .bi_private */
+ kfree(rp);
+
r10bio_pool_free(r10bio, conf);
}
@@ -301,27 +333,18 @@ static void reschedule_retry(struct r10bio *r10_bio)
static void raid_end_bio_io(struct r10bio *r10_bio)
{
struct bio *bio = r10_bio->master_bio;
- int done;
struct r10conf *conf = r10_bio->mddev->private;
- if (bio->bi_phys_segments) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- bio->bi_phys_segments--;
- done = (bio->bi_phys_segments == 0);
- spin_unlock_irqrestore(&conf->device_lock, flags);
- } else
- done = 1;
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
- bio->bi_error = -EIO;
- if (done) {
- bio_endio(bio);
- /*
- * Wake up any possible resync thread that waits for the device
- * to go idle.
- */
- allow_barrier(conf);
- }
+ bio->bi_status = BLK_STS_IOERR;
+
+ bio_endio(bio);
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf);
+
free_r10bio(r10_bio);
}
@@ -366,7 +389,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
static void raid10_end_read_request(struct bio *bio)
{
- int uptodate = !bio->bi_error;
+ int uptodate = !bio->bi_status;
struct r10bio *r10_bio = bio->bi_private;
int slot, dev;
struct md_rdev *rdev;
@@ -454,7 +477,7 @@ static void raid10_end_write_request(struct bio *bio)
struct bio *to_put = NULL;
bool discard_error;
- discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;
+ discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
@@ -468,7 +491,7 @@ static void raid10_end_write_request(struct bio *bio)
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
- if (bio->bi_error && !discard_error) {
+ if (bio->bi_status && !discard_error) {
if (repl)
/* Never record new bad blocks to replacement,
* just fail it.
@@ -860,7 +883,7 @@ static int raid10_congested(struct mddev *mddev, int bits)
if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct request_queue *q = bdev_get_queue(rdev->bdev);
- ret |= bdi_congested(&q->backing_dev_info, bits);
+ ret |= bdi_congested(q->backing_dev_info, bits);
}
}
rcu_read_unlock();
@@ -890,7 +913,7 @@ static void flush_pending_writes(struct r10conf *conf)
bio->bi_next = NULL;
bio->bi_bdev = rdev->bdev;
if (test_bit(Faulty, &rdev->flags)) {
- bio->bi_error = -EIO;
+ bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
!blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
@@ -974,7 +997,8 @@ static void wait_barrier(struct r10conf *conf)
!conf->barrier ||
(atomic_read(&conf->nr_pending) &&
current->bio_list &&
- !bio_list_empty(current->bio_list)),
+ (!bio_list_empty(&current->bio_list[0]) ||
+ !bio_list_empty(&current->bio_list[1]))),
conf->resync_lock);
conf->nr_waiting--;
if (!conf->nr_waiting)
@@ -1074,7 +1098,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
bio->bi_next = NULL;
bio->bi_bdev = rdev->bdev;
if (test_bit(Faulty, &rdev->flags)) {
- bio->bi_error = -EIO;
+ bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
!blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
@@ -1094,12 +1118,41 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
struct bio *read_bio;
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
- int sectors_handled;
int max_sectors;
sector_t sectors;
struct md_rdev *rdev;
- int slot;
+ char b[BDEVNAME_SIZE];
+ int slot = r10_bio->read_slot;
+ struct md_rdev *err_rdev = NULL;
+ gfp_t gfp = GFP_NOIO;
+ if (slot >= 0 && r10_bio->devs[slot].rdev) {
+ /*
+ * slot < 0 is a recycled r10_bio that was last used for a write.
+ * Otherwise this is an error retry, but we cannot safely
+ * dereference the rdev in the r10_bio, we must use the one in conf.
+ * If it has already been disconnected (unlikely)
+ * we lose the device name in error messages.
+ */
+ int disk;
+ /*
+ * As we are blocking raid10, it is a little safer to
+ * use __GFP_HIGH.
+ */
+ gfp = GFP_NOIO | __GFP_HIGH;
+
+ rcu_read_lock();
+ disk = r10_bio->devs[slot].devnum;
+ err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ if (err_rdev)
+ bdevname(err_rdev->bdev, b);
+ else {
+ strcpy(b, "???");
+ /* This never gets dereferenced */
+ err_rdev = r10_bio->devs[slot].rdev;
+ }
+ rcu_read_unlock();
+ }
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
@@ -1107,7 +1160,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
*/
wait_barrier(conf);
- sectors = bio_sectors(bio);
+ sectors = r10_bio->sectors;
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1124,17 +1177,33 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
wait_barrier(conf);
}
-read_again:
rdev = read_balance(conf, r10_bio, &max_sectors);
if (!rdev) {
+ if (err_rdev) {
+ pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
+ mdname(mddev), b,
+ (unsigned long long)r10_bio->sector);
+ }
raid_end_bio_io(r10_bio);
return;
}
+ if (err_rdev)
+ pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b),
+ (unsigned long long)r10_bio->sector);
+ if (max_sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, max_sectors,
+ gfp, conf->bio_split);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ r10_bio->master_bio = bio;
+ r10_bio->sectors = max_sectors;
+ }
slot = r10_bio->read_slot;
- read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
+ read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
r10_bio->devs[slot].bio = read_bio;
r10_bio->devs[slot].rdev = rdev;
@@ -1153,59 +1222,87 @@ read_again:
trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
read_bio, disk_devt(mddev->gendisk),
r10_bio->sector);
- if (max_sectors < r10_bio->sectors) {
- /*
- * Could not read all from this device, so we will need another
- * r10_bio.
- */
- sectors_handled = (r10_bio->sector + max_sectors
- - bio->bi_iter.bi_sector);
- r10_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (bio->bi_phys_segments == 0)
- bio->bi_phys_segments = 2;
- else
- bio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
- /*
- * Cannot call generic_make_request directly as that will be
- * queued in __generic_make_request and subsequent
- * mempool_alloc might block waiting for it. so hand bio over
- * to raid10d.
- */
- reschedule_retry(r10_bio);
-
- r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
- r10_bio->master_bio = bio;
- r10_bio->sectors = bio_sectors(bio) - sectors_handled;
- r10_bio->state = 0;
- r10_bio->mddev = mddev;
- r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
- goto read_again;
- } else
- generic_make_request(read_bio);
+ generic_make_request(read_bio);
return;
}
-static void raid10_write_request(struct mddev *mddev, struct bio *bio,
- struct r10bio *r10_bio)
+static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
+ struct bio *bio, bool replacement,
+ int n_copy)
{
- struct r10conf *conf = mddev->private;
- int i;
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
unsigned long flags;
- struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
struct raid10_plug_cb *plug = NULL;
+ struct r10conf *conf = mddev->private;
+ struct md_rdev *rdev;
+ int devnum = r10_bio->devs[n_copy].devnum;
+ struct bio *mbio;
+
+ if (replacement) {
+ rdev = conf->mirrors[devnum].replacement;
+ if (rdev == NULL) {
+ /* Replacement just got moved to main 'rdev' */
+ smp_mb();
+ rdev = conf->mirrors[devnum].rdev;
+ }
+ } else
+ rdev = conf->mirrors[devnum].rdev;
+
+ mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ if (replacement)
+ r10_bio->devs[n_copy].repl_bio = mbio;
+ else
+ r10_bio->devs[n_copy].bio = mbio;
+
+ mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
+ choose_data_offset(r10_bio, rdev));
+ mbio->bi_bdev = rdev->bdev;
+ mbio->bi_end_io = raid10_end_write_request;
+ bio_set_op_attrs(mbio, op, do_sync | do_fua);
+ if (!replacement && test_bit(FailFast,
+ &conf->mirrors[devnum].rdev->flags)
+ && enough(conf, devnum))
+ mbio->bi_opf |= MD_FAILFAST;
+ mbio->bi_private = r10_bio;
+
+ if (conf->mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
+ mbio, disk_devt(conf->mddev->gendisk),
+ r10_bio->sector);
+ /* flush_pending_writes() needs access to the rdev so...*/
+ mbio->bi_bdev = (void *)rdev;
+
+ atomic_inc(&r10_bio->remaining);
+
+ cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
+ if (cb)
+ plug = container_of(cb, struct raid10_plug_cb, cb);
+ else
+ plug = NULL;
+ if (plug) {
+ bio_list_add(&plug->pending, mbio);
+ plug->pending_cnt++;
+ } else {
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio_list_add(&conf->pending_bio_list, mbio);
+ conf->pending_count++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ md_wakeup_thread(mddev->thread);
+ }
+}
+
+static void raid10_write_request(struct mddev *mddev, struct bio *bio,
+ struct r10bio *r10_bio)
+{
+ struct r10conf *conf = mddev->private;
+ int i;
+ struct md_rdev *blocked_rdev;
sector_t sectors;
- int sectors_handled;
int max_sectors;
- md_write_start(mddev, bio);
-
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
@@ -1213,7 +1310,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
*/
wait_barrier(conf);
- sectors = bio_sectors(bio);
+ sectors = r10_bio->sectors;
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1261,9 +1358,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
* on which we have seen a write error, we want to avoid
* writing to those blocks. This potentially requires several
* writes to write around the bad blocks. Each set of writes
- * gets its own r10_bio with a set of bios attached. The number
- * of r10_bios is recored in bio->bi_phys_segments just as with
- * the read case.
+ * gets its own r10_bio with a set of bios attached.
*/
r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
@@ -1383,132 +1478,31 @@ retry_write:
goto retry_write;
}
- if (max_sectors < r10_bio->sectors) {
- /* We are splitting this into multiple parts, so
- * we need to prepare for allocating another r10_bio.
- */
+ if (max_sectors < r10_bio->sectors)
r10_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (bio->bi_phys_segments == 0)
- bio->bi_phys_segments = 2;
- else
- bio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
+
+ if (r10_bio->sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, r10_bio->sectors,
+ GFP_NOIO, conf->bio_split);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ r10_bio->master_bio = bio;
}
- sectors_handled = r10_bio->sector + max_sectors -
- bio->bi_iter.bi_sector;
atomic_set(&r10_bio->remaining, 1);
bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
for (i = 0; i < conf->copies; i++) {
- struct bio *mbio;
- int d = r10_bio->devs[i].devnum;
- if (r10_bio->devs[i].bio) {
- struct md_rdev *rdev = conf->mirrors[d].rdev;
- mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
- r10_bio->devs[i].bio = mbio;
-
- mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
- choose_data_offset(r10_bio, rdev));
- mbio->bi_bdev = rdev->bdev;
- mbio->bi_end_io = raid10_end_write_request;
- bio_set_op_attrs(mbio, op, do_sync | do_fua);
- if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) &&
- enough(conf, d))
- mbio->bi_opf |= MD_FAILFAST;
- mbio->bi_private = r10_bio;
-
- if (conf->mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
- mbio, disk_devt(conf->mddev->gendisk),
- r10_bio->sector);
- /* flush_pending_writes() needs access to the rdev so...*/
- mbio->bi_bdev = (void*)rdev;
-
- atomic_inc(&r10_bio->remaining);
-
- cb = blk_check_plugged(raid10_unplug, mddev,
- sizeof(*plug));
- if (cb)
- plug = container_of(cb, struct raid10_plug_cb,
- cb);
- else
- plug = NULL;
- spin_lock_irqsave(&conf->device_lock, flags);
- if (plug) {
- bio_list_add(&plug->pending, mbio);
- plug->pending_cnt++;
- } else {
- bio_list_add(&conf->pending_bio_list, mbio);
- conf->pending_count++;
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (!plug)
- md_wakeup_thread(mddev->thread);
- }
-
- if (r10_bio->devs[i].repl_bio) {
- struct md_rdev *rdev = conf->mirrors[d].replacement;
- if (rdev == NULL) {
- /* Replacement just got moved to main 'rdev' */
- smp_mb();
- rdev = conf->mirrors[d].rdev;
- }
- mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
- r10_bio->devs[i].repl_bio = mbio;
-
- mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
- choose_data_offset(r10_bio, rdev));
- mbio->bi_bdev = rdev->bdev;
- mbio->bi_end_io = raid10_end_write_request;
- bio_set_op_attrs(mbio, op, do_sync | do_fua);
- mbio->bi_private = r10_bio;
-
- if (conf->mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
- mbio, disk_devt(conf->mddev->gendisk),
- r10_bio->sector);
- /* flush_pending_writes() needs access to the rdev so...*/
- mbio->bi_bdev = (void*)rdev;
-
- atomic_inc(&r10_bio->remaining);
- spin_lock_irqsave(&conf->device_lock, flags);
- bio_list_add(&conf->pending_bio_list, mbio);
- conf->pending_count++;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (!mddev_check_plugged(mddev))
- md_wakeup_thread(mddev->thread);
- }
- }
-
- /* Don't remove the bias on 'remaining' (one_write_done) until
- * after checking if we need to go around again.
- */
-
- if (sectors_handled < bio_sectors(bio)) {
- one_write_done(r10_bio);
- /* We need another r10_bio. It has already been counted
- * in bio->bi_phys_segments.
- */
- r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
- r10_bio->master_bio = bio;
- r10_bio->sectors = bio_sectors(bio) - sectors_handled;
-
- r10_bio->mddev = mddev;
- r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
- r10_bio->state = 0;
- goto retry_write;
+ if (r10_bio->devs[i].bio)
+ raid10_write_one_disk(mddev, r10_bio, bio, false, i);
+ if (r10_bio->devs[i].repl_bio)
+ raid10_write_one_disk(mddev, r10_bio, bio, true, i);
}
one_write_done(r10_bio);
}
-static void __make_request(struct mddev *mddev, struct bio *bio)
+static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
{
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;
@@ -1516,21 +1510,12 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
r10_bio->master_bio = bio;
- r10_bio->sectors = bio_sectors(bio);
+ r10_bio->sectors = sectors;
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
-
- /*
- * We might need to issue multiple reads to different devices if there
- * are bad blocks around, so we keep track of the number of reads in
- * bio->bi_phys_segments. If this is 0, there is only one r10_bio and
- * no locking will be needed when the request completes. If it is
- * non-zero, then it is the number of not-completed requests.
- */
- bio->bi_phys_segments = 0;
- bio_clear_flag(bio, BIO_SEG_VALID);
+ memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
if (bio_data_dir(bio) == READ)
raid10_read_request(mddev, bio, r10_bio);
@@ -1538,44 +1523,38 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
raid10_write_request(mddev, bio, r10_bio);
}
-static void raid10_make_request(struct mddev *mddev, struct bio *bio)
+static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
{
struct r10conf *conf = mddev->private;
sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
int chunk_sects = chunk_mask + 1;
-
- struct bio *split;
+ int sectors = bio_sectors(bio);
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
- return;
+ return true;
}
- do {
-
- /*
- * If this request crosses a chunk boundary, we need to split
- * it.
- */
- if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
- bio_sectors(bio) > chunk_sects
- && (conf->geo.near_copies < conf->geo.raid_disks
- || conf->prev.near_copies <
- conf->prev.raid_disks))) {
- split = bio_split(bio, chunk_sects -
- (bio->bi_iter.bi_sector &
- (chunk_sects - 1)),
- GFP_NOIO, fs_bio_set);
- bio_chain(split, bio);
- } else {
- split = bio;
- }
+ if (!md_write_start(mddev, bio))
+ return false;
- __make_request(mddev, split);
- } while (split != bio);
+ /*
+ * If this request crosses a chunk boundary, we need to split
+ * it.
+ */
+ if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
+ sectors > chunk_sects
+ && (conf->geo.near_copies < conf->geo.raid_disks
+ || conf->prev.near_copies <
+ conf->prev.raid_disks)))
+ sectors = chunk_sects -
+ (bio->bi_iter.bi_sector &
+ (chunk_sects - 1));
+ __make_request(mddev, bio, sectors);
/* In case raid10d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
+ return true;
}
static void raid10_status(struct seq_file *seq, struct mddev *mddev)
@@ -1896,13 +1875,9 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
* but will never see neither -- if they are careful.
*/
p->replacement = NULL;
- clear_bit(WantReplacement, &rdev->flags);
- } else
- /* We might have just remove the Replacement as faulty
- * Clear the flag just in case
- */
- clear_bit(WantReplacement, &rdev->flags);
+ }
+ clear_bit(WantReplacement, &rdev->flags);
err = md_integrity_register(mddev);
abort:
@@ -1911,19 +1886,11 @@ abort:
return err;
}
-static void end_sync_read(struct bio *bio)
+static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
{
- struct r10bio *r10_bio = bio->bi_private;
struct r10conf *conf = r10_bio->mddev->private;
- int d;
- if (bio == r10_bio->master_bio) {
- /* this is a reshape read */
- d = r10_bio->read_slot; /* really the read dev */
- } else
- d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
-
- if (!bio->bi_error)
+ if (!bio->bi_status)
set_bit(R10BIO_Uptodate, &r10_bio->state);
else
/* The write handler will notice the lack of
@@ -1945,6 +1912,23 @@ static void end_sync_read(struct bio *bio)
}
}
+static void end_sync_read(struct bio *bio)
+{
+ struct r10bio *r10_bio = get_resync_r10bio(bio);
+ struct r10conf *conf = r10_bio->mddev->private;
+ int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
+
+ __end_sync_read(r10_bio, bio, d);
+}
+
+static void end_reshape_read(struct bio *bio)
+{
+ /* reshape read bio isn't allocated from r10buf_pool */
+ struct r10bio *r10_bio = bio->bi_private;
+
+ __end_sync_read(r10_bio, bio, r10_bio->read_slot);
+}
+
static void end_sync_request(struct r10bio *r10_bio)
{
struct mddev *mddev = r10_bio->mddev;
@@ -1974,7 +1958,7 @@ static void end_sync_request(struct r10bio *r10_bio)
static void end_sync_write(struct bio *bio)
{
- struct r10bio *r10_bio = bio->bi_private;
+ struct r10bio *r10_bio = get_resync_r10bio(bio);
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
int d;
@@ -1990,7 +1974,7 @@ static void end_sync_write(struct bio *bio)
else
rdev = conf->mirrors[d].rdev;
- if (bio->bi_error) {
+ if (bio->bi_status) {
if (repl)
md_error(mddev, rdev);
else {
@@ -2033,12 +2017,13 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
int i, first;
struct bio *tbio, *fbio;
int vcnt;
+ struct page **tpages, **fpages;
atomic_set(&r10_bio->remaining, 1);
/* find the first device with a block */
for (i=0; i<conf->copies; i++)
- if (!r10_bio->devs[i].bio->bi_error)
+ if (!r10_bio->devs[i].bio->bi_status)
break;
if (i == conf->copies)
@@ -2048,12 +2033,14 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
fbio = r10_bio->devs[i].bio;
fbio->bi_iter.bi_size = r10_bio->sectors << 9;
fbio->bi_iter.bi_idx = 0;
+ fpages = get_resync_pages(fbio)->pages;
vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
/* now find blocks with errors */
for (i=0 ; i < conf->copies ; i++) {
int j, d;
struct md_rdev *rdev;
+ struct resync_pages *rp;
tbio = r10_bio->devs[i].bio;
@@ -2061,9 +2048,11 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
continue;
if (i == first)
continue;
+
+ tpages = get_resync_pages(tbio)->pages;
d = r10_bio->devs[i].devnum;
rdev = conf->mirrors[d].rdev;
- if (!r10_bio->devs[i].bio->bi_error) {
+ if (!r10_bio->devs[i].bio->bi_status) {
/* We know that the bi_io_vec layout is the same for
* both 'first' and 'i', so we just compare them.
* All vec entries are PAGE_SIZE;
@@ -2073,8 +2062,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
int len = PAGE_SIZE;
if (sectors < (len / 512))
len = sectors * 512;
- if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
- page_address(tbio->bi_io_vec[j].bv_page),
+ if (memcmp(page_address(fpages[j]),
+ page_address(tpages[j]),
len))
break;
sectors -= len/512;
@@ -2095,11 +2084,13 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* First we need to fixup bv_offset, bv_len and
* bi_vecs, as the read request might have corrupted these
*/
+ rp = get_resync_pages(tbio);
bio_reset(tbio);
tbio->bi_vcnt = vcnt;
tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
- tbio->bi_private = r10_bio;
+ rp->raid_bio = r10_bio;
+ tbio->bi_private = rp;
tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
tbio->bi_end_io = end_sync_write;
bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
@@ -2170,6 +2161,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
int idx = 0;
int dr = r10_bio->devs[0].devnum;
int dw = r10_bio->devs[1].devnum;
+ struct page **pages = get_resync_pages(bio)->pages;
while (sectors) {
int s = sectors;
@@ -2185,7 +2177,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
ok = sync_page_io(rdev,
addr,
s << 9,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
REQ_OP_READ, 0, false);
if (ok) {
rdev = conf->mirrors[dw].rdev;
@@ -2193,7 +2185,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
ok = sync_page_io(rdev,
addr,
s << 9,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
REQ_OP_WRITE, 0, false);
if (!ok) {
set_bit(WriteErrorSeen, &rdev->flags);
@@ -2565,7 +2557,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
if (sectors > sect_to_write)
sectors = sect_to_write;
/* Write at 'sector' for 'sectors' */
- wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
wbio->bi_iter.bi_sector = wsector +
@@ -2593,9 +2585,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
struct bio *bio;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev = r10_bio->devs[slot].rdev;
- char b[BDEVNAME_SIZE];
- unsigned long do_sync;
- int max_sectors;
dev_t bio_dev;
sector_t bio_last_sector;
@@ -2608,7 +2597,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
* frozen.
*/
bio = r10_bio->devs[slot].bio;
- bdevname(bio->bi_bdev, b);
bio_dev = bio->bi_bdev->bd_dev;
bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors;
bio_put(bio);
@@ -2624,70 +2612,9 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
md_error(mddev, rdev);
rdev_dec_pending(rdev, mddev);
-
-read_more:
- rdev = read_balance(conf, r10_bio, &max_sectors);
- if (rdev == NULL) {
- pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
- mdname(mddev), b,
- (unsigned long long)r10_bio->sector);
- raid_end_bio_io(r10_bio);
- return;
- }
-
- do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC);
- slot = r10_bio->read_slot;
- pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
- mdname(mddev),
- bdevname(rdev->bdev, b),
- (unsigned long long)r10_bio->sector);
- bio = bio_clone_mddev(r10_bio->master_bio,
- GFP_NOIO, mddev);
- bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
- r10_bio->devs[slot].bio = bio;
- r10_bio->devs[slot].rdev = rdev;
- bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
- + choose_data_offset(r10_bio, rdev);
- bio->bi_bdev = rdev->bdev;
- bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
- if (test_bit(FailFast, &rdev->flags) &&
- test_bit(R10BIO_FailFast, &r10_bio->state))
- bio->bi_opf |= MD_FAILFAST;
- bio->bi_private = r10_bio;
- bio->bi_end_io = raid10_end_read_request;
- trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
- bio, bio_dev,
- bio_last_sector - r10_bio->sectors);
-
- if (max_sectors < r10_bio->sectors) {
- /* Drat - have to split this up more */
- struct bio *mbio = r10_bio->master_bio;
- int sectors_handled =
- r10_bio->sector + max_sectors
- - mbio->bi_iter.bi_sector;
- r10_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (mbio->bi_phys_segments == 0)
- mbio->bi_phys_segments = 2;
- else
- mbio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
- generic_make_request(bio);
-
- r10_bio = mempool_alloc(conf->r10bio_pool,
- GFP_NOIO);
- r10_bio->master_bio = mbio;
- r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
- r10_bio->state = 0;
- set_bit(R10BIO_ReadError,
- &r10_bio->state);
- r10_bio->mddev = mddev;
- r10_bio->sector = mbio->bi_iter.bi_sector
- + sectors_handled;
-
- goto read_more;
- } else
- generic_make_request(bio);
+ allow_barrier(conf);
+ r10_bio->state = 0;
+ raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
}
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
@@ -2708,7 +2635,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev = conf->mirrors[dev].rdev;
if (r10_bio->devs[m].bio == NULL)
continue;
- if (!r10_bio->devs[m].bio->bi_error) {
+ if (!r10_bio->devs[m].bio->bi_status) {
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
@@ -2724,7 +2651,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
if (r10_bio->devs[m].repl_bio == NULL)
continue;
- if (!r10_bio->devs[m].repl_bio->bi_error) {
+ if (!r10_bio->devs[m].repl_bio->bi_status) {
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
@@ -2750,7 +2677,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
r10_bio->devs[m].addr,
r10_bio->sectors, 0);
rdev_dec_pending(rdev, conf->mddev);
- } else if (bio != NULL && bio->bi_error) {
+ } else if (bio != NULL && bio->bi_status) {
fail = true;
if (!narrow_write_error(r10_bio, m)) {
md_error(conf->mddev, rdev);
@@ -2774,6 +2701,11 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
conf->nr_queued++;
spin_unlock_irq(&conf->device_lock);
+ /*
+ * In case freeze_array() is waiting for condition
+ * nr_pending == nr_queued + extra to be true.
+ */
+ wake_up(&conf->wait_barrier);
md_wakeup_thread(conf->mddev->thread);
} else {
if (test_bit(R10BIO_WriteError,
@@ -2848,13 +2780,8 @@ static void raid10d(struct md_thread *thread)
recovery_request_write(mddev, r10_bio);
else if (test_bit(R10BIO_ReadError, &r10_bio->state))
handle_read_error(mddev, r10_bio);
- else {
- /* just a partial read to be scheduled from a
- * separate context
- */
- int slot = r10_bio->read_slot;
- generic_make_request(r10_bio->devs[slot].bio);
- }
+ else
+ WARN_ON_ONCE(1);
cond_resched();
if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
@@ -3168,10 +3095,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
}
bio = r10_bio->devs[0].bio;
- bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_read;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
if (test_bit(FailFast, &rdev->flags))
@@ -3195,10 +3120,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (!test_bit(In_sync, &mrdev->flags)) {
bio = r10_bio->devs[1].bio;
- bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_iter.bi_sector = to_addr
@@ -3223,10 +3146,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mreplace == NULL || bio == NULL ||
test_bit(Faulty, &mreplace->flags))
break;
- bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_iter.bi_sector = to_addr +
@@ -3348,8 +3269,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio->devs[i].repl_bio->bi_end_io = NULL;
bio = r10_bio->devs[i].bio;
- bio_reset(bio);
- bio->bi_error = -EIO;
+ bio->bi_status = BLK_STS_IOERR;
rcu_read_lock();
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
@@ -3373,10 +3293,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
atomic_inc(&r10_bio->remaining);
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_read;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
- if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
+ if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
bio->bi_iter.bi_sector = sector + rdev->data_offset;
bio->bi_bdev = rdev->bdev;
@@ -3388,24 +3307,22 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
continue;
}
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
/* Need to set up for writing to the replacement */
bio = r10_bio->devs[i].repl_bio;
- bio_reset(bio);
- bio->bi_error = -EIO;
+ bio->bi_status = BLK_STS_IOERR;
sector = r10_bio->devs[i].addr;
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
+ if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
bio->bi_iter.bi_sector = sector + rdev->data_offset;
bio->bi_bdev = rdev->bdev;
count++;
+ rcu_read_unlock();
}
if (count < 2) {
@@ -3437,27 +3354,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (len == 0)
break;
for (bio= biolist ; bio ; bio=bio->bi_next) {
- struct bio *bio2;
- page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
- if (bio_add_page(bio, page, len, 0))
- continue;
-
- /* stop here */
- bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
- for (bio2 = biolist;
- bio2 && bio2 != bio;
- bio2 = bio2->bi_next) {
- /* remove last page from this bio */
- bio2->bi_vcnt--;
- bio2->bi_iter.bi_size -= len;
- bio_clear_flag(bio2, BIO_SEG_VALID);
- }
- goto bio_full;
+ struct resync_pages *rp = get_resync_pages(bio);
+ page = resync_fetch_page(rp, rp->idx++);
+ /*
+ * won't fail because the vec table is big enough
+ * to hold all these pages
+ */
+ bio_add_page(bio, page, len, 0);
}
nr_sectors += len>>9;
sector_nr += len>>9;
- } while (biolist->bi_vcnt < RESYNC_PAGES);
- bio_full:
+ } while (get_resync_pages(biolist)->idx < RESYNC_PAGES);
r10_bio->sectors = nr_sectors;
while (biolist) {
@@ -3465,12 +3372,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
biolist = biolist->bi_next;
bio->bi_next = NULL;
- r10_bio = bio->bi_private;
+ r10_bio = get_resync_r10bio(bio);
r10_bio->sectors = nr_sectors;
if (bio->bi_end_io == end_sync_read) {
md_sync_acct(bio->bi_bdev, nr_sectors);
- bio->bi_error = 0;
+ bio->bi_status = 0;
generic_make_request(bio);
}
}
@@ -3647,6 +3554,10 @@ static struct r10conf *setup_conf(struct mddev *mddev)
if (!conf->r10bio_pool)
goto out;
+ conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
+ if (!conf->bio_split)
+ goto out;
+
calc_sectors(conf, mddev->dev_sectors);
if (mddev->reshape_position == MaxSector) {
conf->prev = conf->geo;
@@ -3684,6 +3595,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
mempool_destroy(conf->r10bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf);
}
return ERR_PTR(err);
@@ -3700,6 +3613,9 @@ static int raid10_run(struct mddev *mddev)
int first = 1;
bool discard_supported = false;
+ if (mddev_init_writes_pending(mddev) < 0)
+ return -ENOMEM;
+
if (mddev->private == NULL) {
conf = setup_conf(mddev);
if (IS_ERR(conf))
@@ -3718,6 +3634,7 @@ static int raid10_run(struct mddev *mddev)
blk_queue_max_discard_sectors(mddev->queue,
mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, 0);
+ blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
blk_queue_io_min(mddev->queue, chunk_size);
if (conf->geo.raid_disks % conf->geo.near_copies)
blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
@@ -3728,7 +3645,6 @@ static int raid10_run(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
long long diff;
- struct request_queue *q;
disk_idx = rdev->raid_disk;
if (disk_idx < 0)
@@ -3747,7 +3663,6 @@ static int raid10_run(struct mddev *mddev)
goto out_free_conf;
disk->rdev = rdev;
}
- q = bdev_get_queue(rdev->bdev);
diff = (rdev->new_data_offset - rdev->data_offset);
if (!mddev->reshape_backwards)
diff = -diff;
@@ -3764,6 +3679,7 @@ static int raid10_run(struct mddev *mddev)
if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
discard_supported = true;
+ first = 0;
}
if (mddev->queue) {
@@ -3841,8 +3757,8 @@ static int raid10_run(struct mddev *mddev)
* maybe...
*/
stripe /= conf->geo.near_copies;
- if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
- mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+ if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
+ mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
}
if (md_integrity_register(mddev))
@@ -3893,6 +3809,8 @@ static void raid10_free(struct mddev *mddev, void *priv)
kfree(conf->mirrors);
kfree(conf->mirrors_old);
kfree(conf->mirrors_new);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf);
}
@@ -3944,10 +3862,6 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
return ret;
}
md_set_array_sectors(mddev, size);
- if (mddev->queue) {
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
- }
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > oldsize) {
mddev->recovery_cp = oldsize;
@@ -4170,6 +4084,7 @@ static int raid10_start_reshape(struct mddev *mddev)
diff = 0;
if (first || diff < min_offset_diff)
min_offset_diff = diff;
+ first = 0;
}
}
@@ -4360,6 +4275,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
struct bio *blist;
struct bio *bio, *read_bio;
int sectors_done = 0;
+ struct page **pages;
if (sector_nr == 0) {
/* If restarting in the middle, skip the initial sectors */
@@ -4480,10 +4396,10 @@ read_more:
read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
+ rdev->data_offset);
read_bio->bi_private = r10_bio;
- read_bio->bi_end_io = end_sync_read;
+ read_bio->bi_end_io = end_reshape_read;
bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
- read_bio->bi_error = 0;
+ read_bio->bi_status = 0;
read_bio->bi_vcnt = 0;
read_bio->bi_iter.bi_size = 0;
r10_bio->master_bio = read_bio;
@@ -4510,11 +4426,9 @@ read_more:
if (!rdev2 || test_bit(Faulty, &rdev2->flags))
continue;
- bio_reset(b);
b->bi_bdev = rdev2->bdev;
b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
rdev2->new_data_offset;
- b->bi_private = r10_bio;
b->bi_end_io = end_reshape_write;
bio_set_op_attrs(b, REQ_OP_WRITE, 0);
b->bi_next = blist;
@@ -4524,31 +4438,22 @@ read_more:
/* Now add as many pages as possible to all of these bios. */
nr_sectors = 0;
+ pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
- struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
+ struct page *page = pages[s / (PAGE_SIZE >> 9)];
int len = (max_sectors - s) << 9;
if (len > PAGE_SIZE)
len = PAGE_SIZE;
for (bio = blist; bio ; bio = bio->bi_next) {
- struct bio *bio2;
- if (bio_add_page(bio, page, len, 0))
- continue;
-
- /* Didn't fit, must stop */
- for (bio2 = blist;
- bio2 && bio2 != bio;
- bio2 = bio2->bi_next) {
- /* Remove last page from this bio */
- bio2->bi_vcnt--;
- bio2->bi_iter.bi_size -= len;
- bio_clear_flag(bio2, BIO_SEG_VALID);
- }
- goto bio_full;
+ /*
+ * won't fail because the vec table is big enough
+ * to hold all these pages
+ */
+ bio_add_page(bio, page, len, 0);
}
sector_nr += len >> 9;
nr_sectors += len >> 9;
}
-bio_full:
rcu_read_unlock();
r10_bio->sectors = nr_sectors;
@@ -4643,8 +4548,8 @@ static void end_reshape(struct r10conf *conf)
int stripe = conf->geo.raid_disks *
((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
stripe /= conf->geo.near_copies;
- if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
- conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+ if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
+ conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
}
conf->fullsync = 0;
}
@@ -4662,7 +4567,10 @@ static int handle_reshape_read_error(struct mddev *mddev,
struct r10bio *r10b = &on_stack.r10_bio;
int slot = 0;
int idx = 0;
- struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
+ struct page **pages;
+
+ /* reshape IOs share pages from .devs[0].bio */
+ pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
r10b->sector = r10_bio->sector;
__raid10_find_phys(&conf->prev, r10b);
@@ -4691,7 +4599,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
success = sync_page_io(rdev,
addr,
s << 9,
- bvec[idx].bv_page,
+ pages[idx],
REQ_OP_READ, 0, false);
rdev_dec_pending(rdev, mddev);
rcu_read_lock();
@@ -4719,7 +4627,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
static void end_reshape_write(struct bio *bio)
{
- struct r10bio *r10_bio = bio->bi_private;
+ struct r10bio *r10_bio = get_resync_r10bio(bio);
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
int d;
@@ -4735,7 +4643,7 @@ static void end_reshape_write(struct bio *bio)
rdev = conf->mirrors[d].rdev;
}
- if (bio->bi_error) {
+ if (bio->bi_status) {
/* FIXME should record badblock */
md_error(mddev, rdev);
}
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 3162615e57bd..735ce1a3d260 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -82,6 +82,7 @@ struct r10conf {
mempool_t *r10bio_pool;
mempool_t *r10buf_pool;
struct page *tmppage;
+ struct bio_set *bio_split;
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 302dea3296ba..bfa1e907c472 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -20,15 +20,18 @@
#include <linux/crc32c.h>
#include <linux/random.h>
#include <linux/kthread.h>
+#include <linux/types.h>
#include "md.h"
#include "raid5.h"
#include "bitmap.h"
+#include "raid5-log.h"
/*
* metadata/data stored in disk with 4k size unit (a block) regardless
* underneath hardware sector size. only works with PAGE_SIZE == 4096
*/
#define BLOCK_SECTORS (8)
+#define BLOCK_SECTOR_SHIFT (3)
/*
* log->max_free_space is min(1/4 disk size, 10G reclaimable space).
@@ -42,7 +45,7 @@
/* wake up reclaim thread periodically */
#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
/* start flush with these full stripes */
-#define R5C_FULL_STRIPE_FLUSH_BATCH 256
+#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) (conf->max_nr_stripes / 4)
/* reclaim stripes in groups */
#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
@@ -52,16 +55,6 @@
*/
#define R5L_POOL_SIZE 4
-/*
- * r5c journal modes of the array: write-back or write-through.
- * write-through mode has identical behavior as existing log only
- * implementation.
- */
-enum r5c_journal_mode {
- R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
- R5C_JOURNAL_MODE_WRITE_BACK = 1,
-};
-
static char *r5c_journal_mode_str[] = {"write-through",
"write-back"};
/*
@@ -164,9 +157,60 @@ struct r5l_log {
struct work_struct deferred_io_work;
/* to disable write back during in degraded mode */
struct work_struct disable_writeback_work;
+
+ /* for chunk_aligned_read in writeback mode, details below */
+ spinlock_t tree_lock;
+ struct radix_tree_root big_stripe_tree;
};
/*
+ * Enable chunk_aligned_read() with write back cache.
+ *
+ * Each chunk may contain more than one stripe (for example, a 256kB
+ * chunk contains 64 4kB pages, so this chunk contains 64 stripes). For
+ * chunk_aligned_read, these stripes are grouped into one "big_stripe".
+ * For each big_stripe, we count how many stripes of this big_stripe
+ * are in the write back cache. These data are tracked in a radix tree
+ * (big_stripe_tree). We use radix_tree item pointer as the counter.
+ * r5c_tree_index() is used to calculate keys for the radix tree.
+ *
+ * chunk_aligned_read() calls r5c_big_stripe_cached() to look up
+ * big_stripe of each chunk in the tree. If this big_stripe is in the
+ * tree, chunk_aligned_read() aborts. This look up is protected by
+ * rcu_read_lock().
+ *
+ * It is necessary to remember whether a stripe is counted in
+ * big_stripe_tree. Instead of adding a new flag, we reuse existing flags:
+ * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
+ * two flags is set, the stripe is counted in big_stripe_tree. This
+ * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
+ * r5c_try_caching_write(); and moving clear_bit of
+ * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
+ * r5c_finish_stripe_write_out().
+ */
+
+/*
+ * The radix tree requires the lowest 2 bits of the data pointer to be 2'b00.
+ * So it is necessary to left shift the counter by 2 bits before using it
+ * as the data pointer of the tree.
+ */
+#define R5C_RADIX_COUNT_SHIFT 2
+
+/*
+ * calculate key for big_stripe_tree
+ *
+ * sect: align_bi->bi_iter.bi_sector or sh->sector
+ */
+static inline sector_t r5c_tree_index(struct r5conf *conf,
+ sector_t sect)
+{
+ sector_t offset;
+
+ offset = sector_div(sect, conf->chunk_sectors);
+ return sect;
+}
+
+/*
* an IO range starts from a meta data block and end at the next meta data
* block. The io unit's the meta data block tracks data/parity followed it. io
* unit is written to log disk with normal write, as we always flush log disk
@@ -255,8 +299,7 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
}
static void
-r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
- struct bio_list *return_bi)
+r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
{
struct bio *wbi, *wbi2;
@@ -265,24 +308,21 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
- if (!raid5_dec_bi_active_stripes(wbi)) {
- md_write_end(conf->mddev);
- bio_list_add(return_bi, wbi);
- }
+ md_write_end(conf->mddev);
+ bio_endio(wbi);
wbi = wbi2;
}
}
void r5c_handle_cached_data_endio(struct r5conf *conf,
- struct stripe_head *sh, int disks, struct bio_list *return_bi)
+ struct stripe_head *sh, int disks)
{
int i;
for (i = sh->disks; i--; ) {
if (sh->dev[i].written) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
- r5c_return_dev_pending_writes(conf, &sh->dev[i],
- return_bi);
+ r5c_return_dev_pending_writes(conf, &sh->dev[i]);
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS,
!test_bit(STRIPE_DEGRADED, &sh->state),
@@ -291,6 +331,8 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
}
}
+void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+
/* Check whether we should flush some stripes to free up stripe cache */
void r5c_check_stripe_cache_usage(struct r5conf *conf)
{
@@ -329,7 +371,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
* or a full stripe (chunk size / 4k stripes).
*/
if (atomic_read(&conf->r5c_cached_full_stripes) >=
- min(R5C_FULL_STRIPE_FLUSH_BATCH,
+ min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
conf->chunk_sectors >> STRIPE_SHIFT))
r5l_wake_reclaim(conf->log, 0);
}
@@ -337,17 +379,30 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
/*
* Total log space (in sectors) needed to flush all data in cache
*
- * Currently, writing-out phase automatically includes all pending writes
- * to the same sector. So the reclaim of each stripe takes up to
- * (conf->raid_disks + 1) pages of log space.
+ * To avoid deadlock due to log space, it is necessary to reserve log
+ * space to flush critical stripes (stripes that occupy log space near
+ * last_checkpoint). This function helps check how much log space is
+ * required to flush all cached stripes.
+ *
+ * To reduce log space requirements, two mechanisms are used to give cache
+ * flush higher priorities:
+ * 1. In handle_stripe_dirtying() and schedule_reconstruction(),
+ * stripes ALREADY in journal can be flushed w/o pending writes;
+ * 2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
+ * can be delayed (r5l_add_no_space_stripe).
*
- * To totally avoid deadlock due to log space, the code reserves
- * (conf->raid_disks + 1) pages for each stripe in cache, which is not
- * necessary in most cases.
+ * In cache flush, the stripe goes through 1 and then 2. For a stripe that
+ * already passed 1, flushing it requires at most (conf->max_degraded + 1)
+ * pages of journal space. For a stripe that has not passed 1, flushing it
+ * requires (conf->raid_disks + 1) pages of journal space. There are at
+ * most (conf->group_cnt + 1) stripes that passed 1. So total journal space
+ * required to flush all cached stripes (in pages) is:
*
- * To improve this, we will need writing-out phase to be able to NOT include
- * pending writes, which will reduce the requirement to
- * (conf->max_degraded + 1) pages per stripe in cache.
+ * (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
+ * (group_cnt + 1) * (raid_disks + 1)
+ * or
+ * (stripe_in_journal_count) * (max_degraded + 1) +
+ * (group_cnt + 1) * (raid_disks - max_degraded)
*/
static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
{
@@ -356,8 +411,9 @@ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
if (!r5c_is_writeback(log))
return 0;
- return BLOCK_SECTORS * (conf->raid_disks + 1) *
- atomic_read(&log->stripe_in_journal_count);
+ return BLOCK_SECTORS *
+ ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
+ (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
}
/*
@@ -412,16 +468,6 @@ void r5c_make_stripe_write_out(struct stripe_head *sh)
if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
atomic_inc(&conf->preread_active_stripes);
-
- if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
- BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
- atomic_dec(&conf->r5c_cached_partial_stripes);
- }
-
- if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
- BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
- atomic_dec(&conf->r5c_cached_full_stripes);
- }
}
static void r5c_handle_data_cached(struct stripe_head *sh)
@@ -526,7 +572,7 @@ static void r5l_log_endio(struct bio *bio)
struct r5l_log *log = io->log;
unsigned long flags;
- if (bio->bi_error)
+ if (bio->bi_status)
md_error(log->rdev->mddev, log->rdev);
bio_put(bio);
@@ -534,7 +580,7 @@ static void r5l_log_endio(struct bio *bio)
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
- if (log->need_cache_flush)
+ if (log->need_cache_flush && !list_empty(&io->stripe_list))
r5l_move_to_end_ios(log);
else
r5l_log_run_stripes(log);
@@ -562,9 +608,11 @@ static void r5l_log_endio(struct bio *bio)
bio_endio(bi);
atomic_dec(&io->pending_stripe);
}
- if (atomic_read(&io->pending_stripe) == 0)
- __r5l_stripe_write_finished(io);
}
+
+ /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */
+ if (atomic_read(&io->pending_stripe) == 0)
+ __r5l_stripe_write_finished(io);
}
static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
@@ -575,20 +623,30 @@ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
spin_unlock_irqrestore(&log->io_list_lock, flags);
+ /*
+ * In case of journal device failures, submit_bio will get an error
+ * and call endio, then active stripes will continue the write
+ * process. Therefore, it is not necessary to check Faulty bit
+ * of journal device here.
+ *
+ * We can't check split_bio after current_bio is submitted. If
+ * io->split_bio is null, after current_bio is submitted, current_bio
+ * might already be completed and the io_unit is freed. We submit
+ * split_bio first to avoid the issue.
+ */
+ if (io->split_bio) {
+ if (io->has_flush)
+ io->split_bio->bi_opf |= REQ_PREFLUSH;
+ if (io->has_fua)
+ io->split_bio->bi_opf |= REQ_FUA;
+ submit_bio(io->split_bio);
+ }
+
if (io->has_flush)
io->current_bio->bi_opf |= REQ_PREFLUSH;
if (io->has_fua)
io->current_bio->bi_opf |= REQ_FUA;
submit_bio(io->current_bio);
-
- if (!io->split_bio)
- return;
-
- if (io->has_flush)
- io->split_bio->bi_opf |= REQ_PREFLUSH;
- if (io->has_fua)
- io->split_bio->bi_opf |= REQ_FUA;
- submit_bio(io->split_bio);
}
/* deferred io_unit will be dispatched here */
@@ -623,6 +681,11 @@ static void r5c_disable_writeback_async(struct work_struct *work)
return;
pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
mdname(mddev));
+
+ /* wait superblock change before suspend */
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+
mddev_suspend(mddev);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
mddev_resume(mddev);
@@ -786,6 +849,41 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
r5_reserve_log_entry(log, io);
}
+static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect)
+{
+ struct mddev *mddev = log->rdev->mddev;
+ struct r5conf *conf = mddev->private;
+ struct r5l_io_unit *io;
+ struct r5l_payload_flush *payload;
+ int meta_size;
+
+ /*
+ * payload_flush requires extra writes to the journal.
+ * To avoid handling the extra IO in quiesce, just skip
+ * flush_payload
+ */
+ if (conf->quiesce)
+ return;
+
+ mutex_lock(&log->io_mutex);
+ meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64);
+
+ if (r5l_get_meta(log, meta_size)) {
+ mutex_unlock(&log->io_mutex);
+ return;
+ }
+
+ /* current implementation is one stripe per flush payload */
+ io = log->current_io;
+ payload = page_address(io->meta_page) + io->meta_offset;
+ payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH);
+ payload->header.flags = cpu_to_le16(0);
+ payload->size = cpu_to_le32(sizeof(__le64));
+ payload->flush_stripes[0] = cpu_to_le64(sect);
+ io->meta_offset += meta_size;
+ mutex_unlock(&log->io_mutex);
+}
+
static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
int data_pages, int parity_pages)
{
@@ -1149,7 +1247,7 @@ static void r5l_log_flush_endio(struct bio *bio)
unsigned long flags;
struct r5l_io_unit *io;
- if (bio->bi_error)
+ if (bio->bi_status)
md_error(log->rdev->mddev, log->rdev);
spin_lock_irqsave(&log->io_list_lock, flags);
@@ -1271,6 +1369,10 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
atomic_inc(&conf->active_stripes);
r5c_make_stripe_write_out(sh);
+ if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
+ atomic_inc(&conf->r5c_flushing_partial_stripes);
+ else
+ atomic_inc(&conf->r5c_flushing_full_stripes);
raid5_release_stripe(sh);
}
@@ -1313,12 +1415,16 @@ static void r5c_do_reclaim(struct r5conf *conf)
unsigned long flags;
int total_cached;
int stripes_to_flush;
+ int flushing_partial, flushing_full;
if (!r5c_is_writeback(log))
return;
+ flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes);
+ flushing_full = atomic_read(&conf->r5c_flushing_full_stripes);
total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
- atomic_read(&conf->r5c_cached_full_stripes);
+ atomic_read(&conf->r5c_cached_full_stripes) -
+ flushing_full - flushing_partial;
if (total_cached > conf->min_nr_stripes * 3 / 4 ||
atomic_read(&conf->empty_inactive_list_nr) > 0)
@@ -1328,8 +1434,8 @@ static void r5c_do_reclaim(struct r5conf *conf)
*/
stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
- atomic_read(&conf->r5c_cached_full_stripes) >
- R5C_FULL_STRIPE_FLUSH_BATCH)
+ atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
+ R5C_FULL_STRIPE_FLUSH_BATCH(conf))
/*
* if stripe cache pressure moderate, or if there is many full
* stripes,flush all full stripes
@@ -1362,9 +1468,9 @@ static void r5c_do_reclaim(struct r5conf *conf)
!test_bit(STRIPE_HANDLE, &sh->state) &&
atomic_read(&sh->count) == 0) {
r5c_flush_stripe(conf, sh);
+ if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
+ break;
}
- if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
- break;
}
spin_unlock(&conf->device_lock);
spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
@@ -1488,6 +1594,8 @@ bool r5l_log_disk_error(struct r5conf *conf)
return ret;
}
+#define R5L_RECOVERY_PAGE_POOL_SIZE 256
+
struct r5l_recovery_ctx {
struct page *meta_page; /* current meta */
sector_t meta_total_blocks; /* total size of current meta and data */
@@ -1496,18 +1604,131 @@ struct r5l_recovery_ctx {
int data_parity_stripes; /* number of data_parity stripes */
int data_only_stripes; /* number of data_only stripes */
struct list_head cached_list;
+
+ /*
+ * read ahead page pool (ra_pool)
+ * in recovery, log is read sequentially. It is not efficient to
+ * read every page with sync_page_io(). The read ahead page pool
+ * reads multiple pages with one IO, so further log reads can
+ * just copy data from the pool.
+ */
+ struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE];
+ sector_t pool_offset; /* offset of first page in the pool */
+ int total_pages; /* total allocated pages */
+ int valid_pages; /* pages with valid data */
+ struct bio *ra_bio; /* bio to do the read ahead */
};
+static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx)
+{
+ struct page *page;
+
+ ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, log->bs);
+ if (!ctx->ra_bio)
+ return -ENOMEM;
+
+ ctx->valid_pages = 0;
+ ctx->total_pages = 0;
+ while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) {
+ page = alloc_page(GFP_KERNEL);
+
+ if (!page)
+ break;
+ ctx->ra_pool[ctx->total_pages] = page;
+ ctx->total_pages += 1;
+ }
+
+ if (ctx->total_pages == 0) {
+ bio_put(ctx->ra_bio);
+ return -ENOMEM;
+ }
+
+ ctx->pool_offset = 0;
+ return 0;
+}
+
+static void r5l_recovery_free_ra_pool(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx)
+{
+ int i;
+
+ for (i = 0; i < ctx->total_pages; ++i)
+ put_page(ctx->ra_pool[i]);
+ bio_put(ctx->ra_bio);
+}
+
+/*
+ * fetch ctx->valid_pages pages from offset
+ * In normal cases, ctx->valid_pages == ctx->total_pages after the call.
+ * However, if the offset is close to the end of the journal device,
+ * ctx->valid_pages could be smaller than ctx->total_pages
+ */
+static int r5l_recovery_fetch_ra_pool(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx,
+ sector_t offset)
+{
+ bio_reset(ctx->ra_bio);
+ ctx->ra_bio->bi_bdev = log->rdev->bdev;
+ bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0);
+ ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset;
+
+ ctx->valid_pages = 0;
+ ctx->pool_offset = offset;
+
+ while (ctx->valid_pages < ctx->total_pages) {
+ bio_add_page(ctx->ra_bio,
+ ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, 0);
+ ctx->valid_pages += 1;
+
+ offset = r5l_ring_add(log, offset, BLOCK_SECTORS);
+
+ if (offset == 0) /* reached end of the device */
+ break;
+ }
+
+ return submit_bio_wait(ctx->ra_bio);
+}
+
+/*
+ * try to read a page from the read ahead page pool; if the page is not in
+ * the pool, call r5l_recovery_fetch_ra_pool
+ */
+static int r5l_recovery_read_page(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx,
+ struct page *page,
+ sector_t offset)
+{
+ int ret;
+
+ if (offset < ctx->pool_offset ||
+ offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) {
+ ret = r5l_recovery_fetch_ra_pool(log, ctx, offset);
+ if (ret)
+ return ret;
+ }
+
+ BUG_ON(offset < ctx->pool_offset ||
+ offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS);
+
+ memcpy(page_address(page),
+ page_address(ctx->ra_pool[(offset - ctx->pool_offset) >>
+ BLOCK_SECTOR_SHIFT]),
+ PAGE_SIZE);
+ return 0;
+}
+
static int r5l_recovery_read_meta_block(struct r5l_log *log,
struct r5l_recovery_ctx *ctx)
{
struct page *page = ctx->meta_page;
struct r5l_meta_block *mb;
u32 crc, stored_crc;
+ int ret;
- if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
- false))
- return -EIO;
+ ret = r5l_recovery_read_page(log, ctx, page, ctx->pos);
+ if (ret != 0)
+ return ret;
mb = page_address(page);
stored_crc = le32_to_cpu(mb->checksum);
@@ -1561,7 +1782,7 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
mb, PAGE_SIZE));
if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
- REQ_FUA, false)) {
+ REQ_SYNC | REQ_FUA, false)) {
__free_page(page);
return -EIO;
}
@@ -1589,8 +1810,7 @@ static void r5l_recovery_load_data(struct r5l_log *log,
raid5_compute_sector(conf,
le64_to_cpu(payload->location), 0,
&dd_idx, sh);
- sync_page_io(log->rdev, log_offset, PAGE_SIZE,
- sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
+ r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset);
sh->dev[dd_idx].log_checksum =
le32_to_cpu(payload->checksum[0]);
ctx->meta_total_blocks += BLOCK_SECTORS;
@@ -1609,17 +1829,15 @@ static void r5l_recovery_load_parity(struct r5l_log *log,
struct r5conf *conf = mddev->private;
ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
- sync_page_io(log->rdev, log_offset, PAGE_SIZE,
- sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
+ r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset);
sh->dev[sh->pd_idx].log_checksum =
le32_to_cpu(payload->checksum[0]);
set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
if (sh->qd_idx >= 0) {
- sync_page_io(log->rdev,
- r5l_ring_add(log, log_offset, BLOCK_SECTORS),
- PAGE_SIZE, sh->dev[sh->qd_idx].page,
- REQ_OP_READ, 0, false);
+ r5l_recovery_read_page(
+ log, ctx, sh->dev[sh->qd_idx].page,
+ r5l_ring_add(log, log_offset, BLOCK_SECTORS));
sh->dev[sh->qd_idx].log_checksum =
le32_to_cpu(payload->checksum[1]);
set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
@@ -1750,14 +1968,15 @@ r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
/* if matches return 0; otherwise return -EINVAL */
static int
-r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
+r5l_recovery_verify_data_checksum(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx,
+ struct page *page,
sector_t log_offset, __le32 log_checksum)
{
void *addr;
u32 checksum;
- sync_page_io(log->rdev, log_offset, PAGE_SIZE,
- page, REQ_OP_READ, 0, false);
+ r5l_recovery_read_page(log, ctx, page, log_offset);
addr = kmap_atomic(page);
checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
kunmap_atomic(addr);
@@ -1779,6 +1998,7 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
struct page *page;
struct r5l_payload_data_parity *payload;
+ struct r5l_payload_flush *payload_flush;
page = alloc_page(GFP_KERNEL);
if (!page)
@@ -1786,33 +2006,42 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
while (mb_offset < le32_to_cpu(mb->meta_size)) {
payload = (void *)mb + mb_offset;
+ payload_flush = (void *)mb + mb_offset;
- if (payload->header.type == R5LOG_PAYLOAD_DATA) {
+ if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
if (r5l_recovery_verify_data_checksum(
- log, page, log_offset,
+ log, ctx, page, log_offset,
payload->checksum[0]) < 0)
goto mismatch;
- } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
+ } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
if (r5l_recovery_verify_data_checksum(
- log, page, log_offset,
+ log, ctx, page, log_offset,
payload->checksum[0]) < 0)
goto mismatch;
if (conf->max_degraded == 2 && /* q for RAID 6 */
r5l_recovery_verify_data_checksum(
- log, page,
+ log, ctx, page,
r5l_ring_add(log, log_offset,
BLOCK_SECTORS),
payload->checksum[1]) < 0)
goto mismatch;
- } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */
+ } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
+ /* nothing to do for R5LOG_PAYLOAD_FLUSH here */
+ } else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */
goto mismatch;
- log_offset = r5l_ring_add(log, log_offset,
- le32_to_cpu(payload->size));
+ if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
+ mb_offset += sizeof(struct r5l_payload_flush) +
+ le32_to_cpu(payload_flush->size);
+ } else {
+ /* DATA or PARITY payload */
+ log_offset = r5l_ring_add(log, log_offset,
+ le32_to_cpu(payload->size));
+ mb_offset += sizeof(struct r5l_payload_data_parity) +
+ sizeof(__le32) *
+ (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+ }
- mb_offset += sizeof(struct r5l_payload_data_parity) +
- sizeof(__le32) *
- (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
}
put_page(page);
@@ -1840,6 +2069,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
struct r5conf *conf = mddev->private;
struct r5l_meta_block *mb;
struct r5l_payload_data_parity *payload;
+ struct r5l_payload_flush *payload_flush;
int mb_offset;
sector_t log_offset;
sector_t stripe_sect;
@@ -1865,7 +2095,31 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
int dd;
payload = (void *)mb + mb_offset;
- stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
+ payload_flush = (void *)mb + mb_offset;
+
+ if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
+ int i, count;
+
+ count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
+ for (i = 0; i < count; ++i) {
+ stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
+ sh = r5c_recovery_lookup_stripe(cached_stripe_list,
+ stripe_sect);
+ if (sh) {
+ WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
+ r5l_recovery_reset_stripe(sh);
+ list_del_init(&sh->lru);
+ raid5_release_stripe(sh);
+ }
+ }
+
+ mb_offset += sizeof(struct r5l_payload_flush) +
+ le32_to_cpu(payload_flush->size);
+ continue;
+ }
+
+ /* DATA or PARITY payload */
+ stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
raid5_compute_sector(
conf, le64_to_cpu(payload->location), 0, &dd,
NULL)
@@ -1903,7 +2157,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
list_add_tail(&sh->lru, cached_stripe_list);
}
- if (payload->header.type == R5LOG_PAYLOAD_DATA) {
+ if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
r5l_recovery_replay_one_stripe(conf, sh, ctx);
@@ -1911,7 +2165,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
}
r5l_recovery_load_data(log, sh, ctx, payload,
log_offset);
- } else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
+ } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
r5l_recovery_load_parity(log, sh, ctx, payload,
log_offset);
else
@@ -2113,7 +2367,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
payload = (void *)mb + offset;
payload->header.type = cpu_to_le16(
R5LOG_PAYLOAD_DATA);
- payload->size = BLOCK_SECTORS;
+ payload->size = cpu_to_le32(BLOCK_SECTORS);
payload->location = cpu_to_le64(
raid5_compute_blocknr(sh, i, 0));
addr = kmap_atomic(dev->page);
@@ -2134,7 +2388,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
mb, PAGE_SIZE));
sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
- REQ_OP_WRITE, REQ_FUA, false);
+ REQ_OP_WRITE, REQ_SYNC | REQ_FUA, false);
sh->log_start = ctx->pos;
list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
atomic_inc(&log->stripe_in_journal_count);
@@ -2177,55 +2431,70 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
static int r5l_recovery_log(struct r5l_log *log)
{
struct mddev *mddev = log->rdev->mddev;
- struct r5l_recovery_ctx ctx;
+ struct r5l_recovery_ctx *ctx;
int ret;
sector_t pos;
- ctx.pos = log->last_checkpoint;
- ctx.seq = log->last_cp_seq;
- ctx.meta_page = alloc_page(GFP_KERNEL);
- ctx.data_only_stripes = 0;
- ctx.data_parity_stripes = 0;
- INIT_LIST_HEAD(&ctx.cached_list);
-
- if (!ctx.meta_page)
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
return -ENOMEM;
- ret = r5c_recovery_flush_log(log, &ctx);
- __free_page(ctx.meta_page);
+ ctx->pos = log->last_checkpoint;
+ ctx->seq = log->last_cp_seq;
+ INIT_LIST_HEAD(&ctx->cached_list);
+ ctx->meta_page = alloc_page(GFP_KERNEL);
- if (ret)
- return ret;
+ if (!ctx->meta_page) {
+ ret = -ENOMEM;
+ goto meta_page;
+ }
+
+ if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) {
+ ret = -ENOMEM;
+ goto ra_pool;
+ }
- pos = ctx.pos;
- ctx.seq += 10000;
+ ret = r5c_recovery_flush_log(log, ctx);
+ if (ret)
+ goto error;
- if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
+ pos = ctx->pos;
+ ctx->seq += 10000;
+
+ if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
pr_debug("md/raid:%s: starting from clean shutdown\n",
mdname(mddev));
else
pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
- mdname(mddev), ctx.data_only_stripes,
- ctx.data_parity_stripes);
-
- if (ctx.data_only_stripes == 0) {
- log->next_checkpoint = ctx.pos;
- r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
- ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
- } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
+ mdname(mddev), ctx->data_only_stripes,
+ ctx->data_parity_stripes);
+
+ if (ctx->data_only_stripes == 0) {
+ log->next_checkpoint = ctx->pos;
+ r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++);
+ ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+ } else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) {
pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
mdname(mddev));
- return -EIO;
+ ret = -EIO;
+ goto error;
}
- log->log_start = ctx.pos;
- log->seq = ctx.seq;
+ log->log_start = ctx->pos;
+ log->seq = ctx->seq;
log->last_checkpoint = pos;
r5l_write_super(log, pos);
- r5c_recovery_flush_data_only_stripes(log, &ctx);
- return 0;
+ r5c_recovery_flush_data_only_stripes(log, ctx);
+ ret = 0;
+error:
+ r5l_recovery_free_ra_pool(log, ctx);
+ra_pool:
+ __free_page(ctx->meta_page);
+meta_page:
+ kfree(ctx);
+ return ret;
}
static void r5l_write_super(struct r5l_log *log, sector_t cp)
@@ -2263,40 +2532,56 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
return ret;
}
-static ssize_t r5c_journal_mode_store(struct mddev *mddev,
- const char *page, size_t length)
+/*
+ * Set journal cache mode on @mddev (external API initially needed by dm-raid).
+ *
+ * @mode as defined in 'enum r5c_journal_mode'.
+ *
+ */
+int r5c_journal_mode_set(struct mddev *mddev, int mode)
{
struct r5conf *conf = mddev->private;
struct r5l_log *log = conf->log;
- int val = -1, i;
- int len = length;
if (!log)
return -ENODEV;
- if (len && page[len - 1] == '\n')
- len -= 1;
- for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++)
- if (strlen(r5c_journal_mode_str[i]) == len &&
- strncmp(page, r5c_journal_mode_str[i], len) == 0) {
- val = i;
- break;
- }
- if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
- val > R5C_JOURNAL_MODE_WRITE_BACK)
+ if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
+ mode > R5C_JOURNAL_MODE_WRITE_BACK)
return -EINVAL;
if (raid5_calc_degraded(conf) > 0 &&
- val == R5C_JOURNAL_MODE_WRITE_BACK)
+ mode == R5C_JOURNAL_MODE_WRITE_BACK)
return -EINVAL;
mddev_suspend(mddev);
- conf->log->r5c_journal_mode = val;
+ conf->log->r5c_journal_mode = mode;
mddev_resume(mddev);
pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
- mdname(mddev), val, r5c_journal_mode_str[val]);
- return length;
+ mdname(mddev), mode, r5c_journal_mode_str[mode]);
+ return 0;
+}
+EXPORT_SYMBOL(r5c_journal_mode_set);
+
+/*
+ * Store handler for the journal mode attribute: look up the mode name in
+ * @page (one trailing newline is ignored) in r5c_journal_mode_str[] and
+ * apply the matching index through r5c_journal_mode_set().
+ *
+ * Returns @length on success, otherwise a negative errno.
+ */
+static ssize_t r5c_journal_mode_store(struct mddev *mddev,
+				      const char *page, size_t length)
+{
+	size_t name_len = length;
+	int idx;
+	int err;
+
+	if (name_len < 2)
+		return -EINVAL;
+
+	if (page[name_len - 1] == '\n')
+		name_len--;
+
+	/* walk the table from its last entry down; idx ends at -1 on no match */
+	for (idx = ARRAY_SIZE(r5c_journal_mode_str) - 1; idx >= 0; idx--) {
+		if (strlen(r5c_journal_mode_str[idx]) != name_len)
+			continue;
+		if (strncmp(page, r5c_journal_mode_str[idx], name_len) == 0)
+			break;
+	}
+
+	err = r5c_journal_mode_set(mddev, idx);
+	if (err)
+		return err;
+
+	return length;
+}
struct md_sysfs_entry
@@ -2320,6 +2605,10 @@ int r5c_try_caching_write(struct r5conf *conf,
int i;
struct r5dev *dev;
int to_cache = 0;
+ void **pslot;
+ sector_t tree_index;
+ int ret;
+ uintptr_t refcount;
BUG_ON(!r5c_is_writeback(log));
@@ -2348,8 +2637,11 @@ int r5c_try_caching_write(struct r5conf *conf,
* When run in degraded mode, array is set to write-through mode.
* This check helps drain pending write safely in the transition to
* write-through mode.
+ *
+ * When a stripe is syncing, the write is also handled in write
+ * through mode.
*/
- if (s->failed) {
+ if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
r5c_make_stripe_write_out(sh);
return -EAGAIN;
}
@@ -2364,6 +2656,44 @@ int r5c_try_caching_write(struct r5conf *conf,
}
}
+ /* if the stripe is not counted in big_stripe_tree, add it now */
+ if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+ !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+ tree_index = r5c_tree_index(conf, sh->sector);
+ spin_lock(&log->tree_lock);
+ pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
+ tree_index);
+ if (pslot) {
+ refcount = (uintptr_t)radix_tree_deref_slot_protected(
+ pslot, &log->tree_lock) >>
+ R5C_RADIX_COUNT_SHIFT;
+ radix_tree_replace_slot(
+ &log->big_stripe_tree, pslot,
+ (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
+ } else {
+ /*
+ * this radix_tree_insert can fail safely, so no
+ * need to call radix_tree_preload()
+ */
+ ret = radix_tree_insert(
+ &log->big_stripe_tree, tree_index,
+ (void *)(1 << R5C_RADIX_COUNT_SHIFT));
+ if (ret) {
+ spin_unlock(&log->tree_lock);
+ r5c_make_stripe_write_out(sh);
+ return -EAGAIN;
+ }
+ }
+ spin_unlock(&log->tree_lock);
+
+ /*
+ * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is
+ * counted in the radix tree
+ */
+ set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
+ atomic_inc(&conf->r5c_cached_partial_stripes);
+ }
+
for (i = disks; i--; ) {
dev = &sh->dev[i];
if (dev->towrite) {
@@ -2438,17 +2768,20 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
struct stripe_head *sh,
struct stripe_head_state *s)
{
+ struct r5l_log *log = conf->log;
int i;
int do_wakeup = 0;
+ sector_t tree_index;
+ void **pslot;
+ uintptr_t refcount;
- if (!conf->log ||
- !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
+ if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
return;
WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
- if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+ if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
return;
for (i = sh->disks; i--; ) {
@@ -2470,17 +2803,53 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
if (do_wakeup)
wake_up(&conf->wait_for_overlap);
- spin_lock_irq(&conf->log->stripe_in_journal_lock);
+ spin_lock_irq(&log->stripe_in_journal_lock);
list_del_init(&sh->r5c);
- spin_unlock_irq(&conf->log->stripe_in_journal_lock);
+ spin_unlock_irq(&log->stripe_in_journal_lock);
sh->log_start = MaxSector;
- atomic_dec(&conf->log->stripe_in_journal_count);
- r5c_update_log_state(conf->log);
+
+ atomic_dec(&log->stripe_in_journal_count);
+ r5c_update_log_state(log);
+
+ /* stop counting this stripe in big_stripe_tree */
+ if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
+ test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+ tree_index = r5c_tree_index(conf, sh->sector);
+ spin_lock(&log->tree_lock);
+ pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
+ tree_index);
+ BUG_ON(pslot == NULL);
+ refcount = (uintptr_t)radix_tree_deref_slot_protected(
+ pslot, &log->tree_lock) >>
+ R5C_RADIX_COUNT_SHIFT;
+ if (refcount == 1)
+ radix_tree_delete(&log->big_stripe_tree, tree_index);
+ else
+ radix_tree_replace_slot(
+ &log->big_stripe_tree, pslot,
+ (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
+ spin_unlock(&log->tree_lock);
+ }
+
+ if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
+ BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
+ atomic_dec(&conf->r5c_flushing_partial_stripes);
+ atomic_dec(&conf->r5c_cached_partial_stripes);
+ }
+
+ if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+ BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
+ atomic_dec(&conf->r5c_flushing_full_stripes);
+ atomic_dec(&conf->r5c_cached_full_stripes);
+ }
+
+ r5l_append_flush_payload(log, sh->sector);
+ /* stripe is flushed to raid disks, we can do resync now */
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+ set_bit(STRIPE_HANDLE, &sh->state);
}
-int
-r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
- struct stripe_head_state *s)
+int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
int pages = 0;
@@ -2535,6 +2904,22 @@ r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
return 0;
}
+/*
+ * Check whether the big stripe containing @sect is currently counted in the
+ * write-back cache, i.e. whether log->big_stripe_tree has an entry for its
+ * tree index.
+ *
+ * Callers must hold rcu_read_lock(): r5l_exit_log() clears conf->log and
+ * then calls synchronize_rcu() before it goes on to tear the log down.
+ *
+ * Returns false when the array has no log.
+ */
+bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
+{
+	/*
+	 * conf->log may be set to NULL concurrently by r5l_exit_log(); load
+	 * it exactly once so the NULL check and the lookup below are
+	 * guaranteed to use the same pointer value.
+	 */
+	struct r5l_log *log = READ_ONCE(conf->log);
+	sector_t tree_index;
+	void *slot;
+
+	if (!log)
+		return false;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	tree_index = r5c_tree_index(conf, sect);
+	slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
+	return slot != NULL;
+}
+
static int r5l_load_log(struct r5l_log *log)
{
struct md_rdev *rdev = log->rdev;
@@ -2610,7 +2995,7 @@ ioerr:
return ret;
}
-void r5c_update_on_rdev_error(struct mddev *mddev)
+void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
{
struct r5conf *conf = mddev->private;
struct r5l_log *log = conf->log;
@@ -2618,7 +3003,8 @@ void r5c_update_on_rdev_error(struct mddev *mddev)
if (!log)
return;
- if (raid5_calc_degraded(conf) > 0 &&
+ if ((raid5_calc_degraded(conf) > 0 ||
+ test_bit(Journal, &rdev->flags)) &&
conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
schedule_work(&log->disable_writeback_work);
}
@@ -2627,6 +3013,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
struct request_queue *q = bdev_get_queue(rdev->bdev);
struct r5l_log *log;
+ char b[BDEVNAME_SIZE];
+
+ pr_debug("md/raid:%s: using device %s as journal\n",
+ mdname(conf->mddev), bdevname(rdev->bdev, b));
if (PAGE_SIZE != 4096)
return -EINVAL;
@@ -2673,7 +3063,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
if (!log->io_pool)
goto io_pool;
- log->bs = bioset_create(R5L_POOL_SIZE, 0);
+ log->bs = bioset_create(R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
if (!log->bs)
goto io_bs;
@@ -2681,6 +3071,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
if (!log->meta_pool)
goto out_mempool;
+ spin_lock_init(&log->tree_lock);
+ INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
+
log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
log->rdev->mddev, "reclaim");
if (!log->reclaim_thread)
@@ -2726,8 +3119,13 @@ io_kc:
return -EINVAL;
}
-void r5l_exit_log(struct r5l_log *log)
+void r5l_exit_log(struct r5conf *conf)
{
+ struct r5l_log *log = conf->log;
+
+ conf->log = NULL;
+ synchronize_rcu();
+
flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
mempool_destroy(log->meta_pool);
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
new file mode 100644
index 000000000000..328d67aedda4
--- /dev/null
+++ b/drivers/md/raid5-log.h
@@ -0,0 +1,116 @@
+#ifndef _RAID5_LOG_H
+#define _RAID5_LOG_H
+/* Journal log (r5l_*) and journal cache (r5c_*) entry points. */
+extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+extern void r5l_exit_log(struct r5conf *conf); /* clears conf->log itself */
+extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
+extern void r5l_write_stripe_run(struct r5l_log *log);
+extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
+extern void r5l_stripe_write_finished(struct stripe_head *sh);
+extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
+extern void r5l_quiesce(struct r5l_log *log, int state);
+extern bool r5l_log_disk_error(struct r5conf *conf);
+extern bool r5c_is_writeback(struct r5l_log *log);
+extern int
+r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s, int disks);
+extern void
+r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s);
+extern void r5c_release_extra_page(struct stripe_head *sh);
+extern void r5c_use_extra_page(struct stripe_head *sh);
+extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+extern void r5c_handle_cached_data_endio(struct r5conf *conf,
+ struct stripe_head *sh, int disks);
+extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
+extern void r5c_make_stripe_write_out(struct stripe_head *sh);
+extern void r5c_flush_cache(struct r5conf *conf, int num);
+extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
+extern void r5c_check_cached_full_stripe(struct r5conf *conf);
+extern struct md_sysfs_entry r5c_journal_mode;
+extern void r5c_update_on_rdev_error(struct mddev *mddev,
+ struct md_rdev *rdev);
+extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); /* warns if called without rcu_read_lock() held */
+/* Partial Parity Log (ppl_*) entry points; see raid5-ppl.c. */
+extern struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx);
+extern int ppl_init_log(struct r5conf *conf);
+extern void ppl_exit_log(struct r5conf *conf);
+extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
+extern void ppl_write_stripe_run(struct r5conf *conf);
+extern void ppl_stripe_write_finished(struct stripe_head *sh);
+extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
+
+/* Report whether MD_HAS_PPL is set in the flags of the array's mddev. */
+static inline bool raid5_has_ppl(struct r5conf *conf)
+{
+	struct mddev *mddev = conf->mddev;
+
+	return test_bit(MD_HAS_PPL, &mddev->flags);
+}
+
+/*
+ * Hand stripe @sh to whichever log the array uses.
+ *
+ * With a journal (conf->log set) the stripe is either written out to the
+ * log or, while it is in the caching phase and trapped by the log, has its
+ * data cached.  Without a journal but with PPL enabled it goes to the PPL.
+ *
+ * Returns the result of the log call, 0 when a write-out has to wait for an
+ * extra page, or -EAGAIN when no log handled the stripe.
+ */
+static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s)
+{
+	struct r5conf *conf = sh->raid_conf;
+
+	if (!conf->log)
+		return raid5_has_ppl(conf) ? ppl_write_stripe(conf, sh) :
+					     -EAGAIN;
+
+	if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+		/* caching phase */
+		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
+			return r5c_cache_data(conf->log, sh);
+		return -EAGAIN;
+	}
+
+	/* writing out phase */
+	if (s->waiting_extra_page)
+		return 0;
+
+	return r5l_write_stripe(conf->log, sh);
+}
+
+/* Tell the journal, or else the PPL if enabled, that @sh has been written. */
+static inline void log_stripe_write_finished(struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+
+	if (conf->log) {
+		r5l_stripe_write_finished(sh);
+		return;
+	}
+
+	if (raid5_has_ppl(conf))
+		ppl_stripe_write_finished(sh);
+}
+
+/* Forward to r5l_write_stripe_run() or, without a journal, to the PPL. */
+static inline void log_write_stripe_run(struct r5conf *conf)
+{
+	if (conf->log) {
+		r5l_write_stripe_run(conf->log);
+		return;
+	}
+
+	if (raid5_has_ppl(conf))
+		ppl_write_stripe_run(conf);
+}
+
+/* Shut down the journal if there is one, otherwise the PPL if enabled. */
+static inline void log_exit(struct r5conf *conf)
+{
+	if (conf->log) {
+		r5l_exit_log(conf);
+		return;
+	}
+
+	if (raid5_has_ppl(conf))
+		ppl_exit_log(conf);
+}
+
+/*
+ * Initialise the log for @conf: the journal when @journal_dev is given,
+ * otherwise the PPL when @ppl is true.  Returns the initialiser's result,
+ * or 0 when neither kind of log is requested.
+ */
+static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev,
+			   bool ppl)
+{
+	if (journal_dev)
+		return r5l_init_log(conf, journal_dev);
+
+	return ppl ? ppl_init_log(conf) : 0;
+}
+
+/*
+ * Pass @rdev and @add on to ppl_modify_log() when PPL is enabled.
+ * Returns its result, or 0 when the array has no PPL.
+ */
+static inline int log_modify(struct r5conf *conf, struct md_rdev *rdev, bool add)
+{
+	if (!raid5_has_ppl(conf))
+		return 0;
+
+	return ppl_modify_log(conf, rdev, add);
+}
+
+#endif
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
new file mode 100644
index 000000000000..44ad5baf3206
--- /dev/null
+++ b/drivers/md/raid5-ppl.c
@@ -0,0 +1,1271 @@
+/*
+ * Partial Parity Log for closing the RAID5 write hole
+ * Copyright (c) 2017, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/crc32c.h>
+#include <linux/flex_array.h>
+#include <linux/async_tx.h>
+#include <linux/raid/md_p.h>
+#include "md.h"
+#include "raid5.h"
+
+/*
+ * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
+ * partial parity data. The header contains an array of entries
+ * (struct ppl_header_entry) which describe the logged write requests.
+ * Partial parity for the entries comes after the header, written in the same
+ * sequence as the entries:
+ *
+ * Header
+ * entry0
+ * ...
+ * entryN
+ * PP data
+ * PP for entry0
+ * ...
+ * PP for entryN
+ *
+ * An entry describes one or more consecutive stripe_heads, up to a full
+ * stripe. The modified raid data chunks form an m-by-n matrix, where m is the
+ * number of stripe_heads in the entry and n is the number of modified data
+ * disks. Every stripe_head in the entry must write to the same data disks.
+ * An example of a valid case described by a single entry (writes to the first
+ * stripe of a 4 disk array, 16k chunk size):
+ *
+ * sh->sector dd0 dd1 dd2 ppl
+ * +-----+-----+-----+
+ * 0 | --- | --- | --- | +----+
+ * 8 | -W- | -W- | --- | | pp | data_sector = 8
+ * 16 | -W- | -W- | --- | | pp | data_size = 3 * 2 * 4k
+ * 24 | -W- | -W- | --- | | pp | pp_size = 3 * 4k
+ * +-----+-----+-----+ +----+
+ *
+ * data_sector is the first raid sector of the modified data, data_size is the
+ * total size of modified data and pp_size is the size of partial parity for
+ * this entry. Entries for full stripe writes contain no partial parity
+ * (pp_size = 0), they only mark the stripes for which parity should be
+ * recalculated after an unclean shutdown. Every entry holds a checksum of its
+ * partial parity, the header also has a checksum of the header itself.
+ *
+ * A write request is always logged to the PPL instance stored on the parity
+ * disk of the corresponding stripe. For each member disk there is one ppl_log
+ * used to handle logging for this disk, independently from others. They are
+ * grouped in child_logs array in struct ppl_conf, which is assigned to
+ * r5conf->log_private.
+ *
+ * ppl_io_unit represents a full PPL write, header_page contains the ppl_header.
+ * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head
+ * can be appended to the last entry if it meets the conditions for a valid
+ * entry described above, otherwise a new entry is added. Checksums of entries
+ * are calculated incrementally as stripes containing partial parity are being
+ * added. ppl_submit_iounit() calculates the checksum of the header and submits
+ * a bio containing the header page and partial parity pages (sh->ppl_page) for
+ * all stripes of the io_unit. When the PPL write completes, the stripes
+ * associated with the io_unit are released and raid5d starts writing their data
+ * and parity. When all stripes are written, the io_unit is freed and the next
+ * can be submitted.
+ *
+ * An io_unit is used to gather stripes until it is submitted or becomes full
+ * (if the maximum number of entries or size of PPL is reached). Another io_unit
+ * can't be submitted until the previous has completed (PPL and stripe
+ * data+parity is written). The log->io_list tracks all io_units of a log
+ * (for a single member disk). New io_units are added to the end of the list
+ * and the first io_unit is submitted, if it is not submitted already.
+ * The current io_unit accepting new stripes is always at the end of the list.
+ */
+
+struct ppl_conf { /* array-wide PPL state, shared by all ppl_log instances */
+ struct mddev *mddev;
+
+ /* array of child logs, one for each raid disk */
+ struct ppl_log *child_logs;
+ int count; /* presumably the number of child_logs entries - TODO confirm */
+
+ int block_size; /* the logical block size used for data_sector
+ * in ppl_header_entry */
+ u32 signature; /* raid array identifier; copied into each ppl_header */
+ atomic64_t seq; /* current log write sequence number; incremented for
+ * every new io_unit in ppl_new_iounit() */
+
+ struct kmem_cache *io_kc; /* NOTE(review): presumably the slab cache backing io_pool - confirm */
+ mempool_t *io_pool; /* ppl_new_iounit() allocates io_units from here */
+ struct bio_set *bs; /* NOTE(review): users not visible here - confirm purpose */
+
+ /* used only for recovery */
+ int recovered_entries;
+ int mismatch_count;
+
+ /* stripes to retry if failed to allocate io_unit */
+ struct list_head no_mem_stripes;
+ spinlock_t no_mem_stripes_lock; /* presumably protects no_mem_stripes - confirm */
+};
+
+struct ppl_log { /* PPL instance handling logging for one member disk */
+ struct ppl_conf *ppl_conf; /* shared between all log instances */
+
+ struct md_rdev *rdev; /* array member disk associated with
+ * this log instance */
+ struct mutex io_mutex; /* NOTE(review): presumably serializes callers of ppl_log_stripe() - confirm */
+ struct ppl_io_unit *current_io; /* current io_unit accepting new data
+ * always at the end of io_list */
+ spinlock_t io_list_lock; /* held while ppl_log_stripe() appends to io_list */
+ struct list_head io_list; /* all io_units of this log */
+};
+
+#define PPL_IO_INLINE_BVECS 32 /* number of bio_vecs embedded in struct ppl_io_unit */
+
+struct ppl_io_unit { /* one full PPL write: a header page plus the stripes logged in it */
+ struct ppl_log *log; /* the log this io_unit was created for */
+
+ struct page *header_page; /* for ppl_header */
+
+ unsigned int entries_count; /* number of entries in ppl_header */
+ unsigned int pp_size; /* current total size of partial parity, in bytes */
+
+ u64 seq; /* sequence number of this log write */
+ struct list_head log_sibling; /* log->io_list */
+
+ struct list_head stripe_list; /* stripes added to the io_unit */
+ atomic_t pending_stripes; /* how many stripes not written to raid */
+
+ bool submitted; /* true if write to log started */
+
+ /* inline bio and its biovec for submitting the iounit */
+ struct bio bio;
+ struct bio_vec biovec[PPL_IO_INLINE_BVECS];
+};
+
+/*
+ * Queue the computation of the partial parity of @sh into sh->ppl_page,
+ * chained after @tx.
+ *
+ * Partial parity is the XOR of the stripe's data chunks that the write
+ * request leaves unchanged.  How it is obtained depends on the data at
+ * hand (read-modify-write vs. reconstruct-write); in any other
+ * reconstruct state nothing is queued.
+ *
+ * Returns the descriptor of the queued operation, or @tx unchanged when
+ * nothing was queued.
+ */
+struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+		       struct dma_async_tx_descriptor *tx)
+{
+	struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
+	struct async_submit_ctl submit;
+	void *conv_area;
+	int nr_srcs = 0;
+	int i;
+
+	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+
+	switch (sh->reconstruct_state) {
+	case reconstruct_state_prexor_drain_run:
+		/*
+		 * rmw: the xor of old data and parity of the updated disks
+		 * was already produced by ops_run_prexor5(), so the parity
+		 * dev page only has to be copied.
+		 */
+		xor_srcs[nr_srcs++] = sh->dev[sh->pd_idx].page;
+		break;
+	case reconstruct_state_drain_run:
+		/* rcw: xor the data of every disk that is not updated */
+		for (i = sh->disks - 1; i >= 0; i--) {
+			struct r5dev *dev = &sh->dev[i];
+
+			if (test_bit(R5_UPTODATE, &dev->flags))
+				xor_srcs[nr_srcs++] = dev->page;
+		}
+		break;
+	default:
+		return tx;
+	}
+
+	/* scratch area located past the source page pointers in the scribble */
+	conv_area = flex_array_get(percpu->scribble, 0) +
+		    sizeof(struct page *) * (sh->disks + 2);
+	init_async_submit(&submit, ASYNC_TX_FENCE | ASYNC_TX_XOR_ZERO_DST, tx,
+			  NULL, sh, conv_area);
+
+	if (nr_srcs != 1)
+		tx = async_xor(sh->ppl_page, xor_srcs, 0, nr_srcs, PAGE_SIZE,
+			       &submit);
+	else
+		tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
+				  &submit);
+
+	return tx;
+}
+
+/*
+ * Pool allocation callback: allocate a ppl_io_unit from the kmem_cache
+ * passed as @pool_data together with its header page.  Returns NULL if
+ * either allocation fails.
+ */
+static void *ppl_io_pool_alloc(gfp_t gfp_mask, void *pool_data)
+{
+	struct kmem_cache *cache = pool_data;
+	struct ppl_io_unit *unit;
+
+	unit = kmem_cache_alloc(cache, gfp_mask);
+	if (!unit)
+		goto fail;
+
+	unit->header_page = alloc_page(gfp_mask);
+	if (!unit->header_page)
+		goto free_unit;
+
+	return unit;
+
+free_unit:
+	kmem_cache_free(cache, unit);
+fail:
+	return NULL;
+}
+
+/*
+ * Pool free callback: release the header page of the ppl_io_unit
+ * @element, then return the unit to the kmem_cache in @pool_data.
+ */
+static void ppl_io_pool_free(void *element, void *pool_data)
+{
+	struct ppl_io_unit *unit = element;
+
+	__free_page(unit->header_page);
+	kmem_cache_free((struct kmem_cache *)pool_data, unit);
+}
+
+/*
+ * Take an io_unit from the pool (without waiting) and prepare it for
+ * @log: the unit is zeroed apart from its header page, the header page is
+ * reset and stamped with the array signature, and the unit receives the
+ * next sequence number.  @sh is not used.
+ *
+ * Returns the new io_unit, or NULL if the pool could not provide one.
+ */
+static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
+					  struct stripe_head *sh)
+{
+	struct ppl_conf *conf = log->ppl_conf;
+	struct ppl_io_unit *unit;
+	struct ppl_header *hdr;
+	struct page *kept_page;
+
+	unit = mempool_alloc(conf->io_pool, GFP_NOWAIT);
+	if (!unit)
+		return NULL;
+
+	/* wipe the unit but keep the header page that came with it */
+	kept_page = unit->header_page;
+	memset(unit, 0, sizeof(*unit));
+	unit->header_page = kept_page;
+
+	/* start from an empty header */
+	hdr = page_address(unit->header_page);
+	clear_page(hdr);
+	memset(hdr->reserved, 0xff, PPL_HDR_RESERVED);
+	hdr->signature = cpu_to_le32(conf->signature);
+
+	unit->log = log;
+	INIT_LIST_HEAD(&unit->log_sibling);
+	INIT_LIST_HEAD(&unit->stripe_list);
+	atomic_set(&unit->pending_stripes, 0);
+	bio_init(&unit->bio, unit->biovec, PPL_IO_INLINE_BVECS);
+
+	unit->seq = atomic64_add_return(1, &conf->seq);
+	hdr->generation = cpu_to_le64(unit->seq);
+
+	return unit;
+}
+/*
+ * Add stripe @sh to the io_unit of @log that is currently accepting
+ * stripes (log->current_io), starting a new io_unit when there is none or
+ * the current one is full.  The stripe is accounted for in a header entry:
+ * either by extending the last entry, when it directly continues it, or by
+ * opening a new one.  Unless the stripe is a full stripe write, one page of
+ * partial parity (sh->ppl_page) is added to the entry's size and checksum.
+ *
+ * Returns 0 on success or -ENOMEM if a new io_unit could not be allocated.
+ */
+static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
+{
+ struct ppl_io_unit *io = log->current_io;
+ struct ppl_header_entry *e = NULL; /* entry that will account for @sh */
+ struct ppl_header *pplhdr;
+ int i;
+ sector_t data_sector = 0; /* lowest dev->sector among the written data devs */
+ int data_disks = 0; /* number of data devs with R5_Wantwrite set */
+ unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE; /* assumes ppl.size is in 512-byte sectors - TODO confirm */
+ struct r5conf *conf = sh->raid_conf;
+
+ pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);
+
+ /*
+ * check if current io_unit is full
+ * NOTE(review): pp_size only grows in PAGE_SIZE steps and is compared
+ * with '==', so this relies on entry_space being a multiple of
+ * PAGE_SIZE - confirm.
+ */
+ if (io && (io->pp_size == entry_space ||
+ io->entries_count == PPL_HDR_MAX_ENTRIES)) {
+ pr_debug("%s: add io_unit blocked by seq: %llu\n",
+ __func__, io->seq);
+ io = NULL;
+ }
+
+ /* add a new unit if there is none or the current is full */
+ if (!io) {
+ io = ppl_new_iounit(log, sh);
+ if (!io)
+ return -ENOMEM;
+ spin_lock_irq(&log->io_list_lock);
+ list_add_tail(&io->log_sibling, &log->io_list);
+ spin_unlock_irq(&log->io_list_lock);
+
+ log->current_io = io;
+ }
+
+ for (i = 0; i < sh->disks; i++) { /* find the first written data sector and count written data disks */
+ struct r5dev *dev = &sh->dev[i];
+
+ if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
+ if (!data_disks || dev->sector < data_sector)
+ data_sector = dev->sector;
+ data_disks++;
+ }
+ }
+ BUG_ON(!data_disks); /* a stripe logged here must write at least one data disk */
+
+ pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
+ io->seq, (unsigned long long)data_sector, data_disks);
+
+ pplhdr = page_address(io->header_page);
+
+ if (io->entries_count > 0) {
+ struct ppl_header_entry *last =
+ &pplhdr->entries[io->entries_count - 1];
+ struct stripe_head *sh_last = list_last_entry(
+ &io->stripe_list, struct stripe_head, log_list);
+ u64 data_sector_last = le64_to_cpu(last->data_sector);
+ u32 data_size_last = le32_to_cpu(last->data_size);
+
+ /*
+ * Check if we can append the stripe to the last entry. It must
+ * be just after the last logged stripe and write to the same
+ * disks. Use bit shift and logarithm to avoid 64-bit division.
+ *
+ * The three conditions below are: @sh directly follows the last
+ * logged stripe, both data sectors fall into the same chunk, and
+ * the sector distance times data_disks equals the entry's
+ * data_size in sectors.
+ */
+ if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
+ (data_sector >> ilog2(conf->chunk_sectors) ==
+ data_sector_last >> ilog2(conf->chunk_sectors)) &&
+ ((data_sector - data_sector_last) * data_disks ==
+ data_size_last >> 9))
+ e = last;
+ }
+
+ if (!e) { /* cannot append: open a new entry for this stripe */
+ e = &pplhdr->entries[io->entries_count++];
+ e->data_sector = cpu_to_le64(data_sector);
+ e->parity_disk = cpu_to_le32(sh->pd_idx);
+ e->checksum = cpu_to_le32(~0); /* initial value for the incremental crc32c below */
+ }
+
+ le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT); /* one page per written data disk */
+
+ /* don't write any PP if full stripe write */
+ if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
+ le32_add_cpu(&e->pp_size, PAGE_SIZE);
+ io->pp_size += PAGE_SIZE;
+ e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
+ page_address(sh->ppl_page),
+ PAGE_SIZE));
+ }
+
+ list_add_tail(&sh->log_list, &io->stripe_list);
+ atomic_inc(&io->pending_stripes);
+ sh->ppl_io = io; /* ppl_write_stripe() reads sh->ppl_io for this stripe */
+
+ return 0;
+}
+/*
+ * Try to log stripe @sh in the PPL of its parity disk.
+ *
+ * Returns -EAGAIN when the stripe is not logged: it already has an io_unit,
+ * is syncing, has no ppl_page, its parity device is not being written or is
+ * not in sync, or the log device is missing or Faulty. Otherwise returns 0
+ * with STRIPE_LOG_TRAPPED set and an extra reference taken on the stripe; if
+ * no io_unit could be allocated the stripe is parked on no_mem_stripes.
+ */
+int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
+{
+ struct ppl_conf *ppl_conf = conf->log_private;
+ struct ppl_io_unit *io = sh->ppl_io;
+ struct ppl_log *log;
+
+ if (io || test_bit(STRIPE_SYNCING, &sh->state) || !sh->ppl_page ||
+ !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
+ !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
+ clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
+ return -EAGAIN;
+ }
+ /* child logs are indexed by disk; use the one of the stripe's parity disk */
+ log = &ppl_conf->child_logs[sh->pd_idx];
+
+ mutex_lock(&log->io_mutex);
+
+ if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
+ mutex_unlock(&log->io_mutex);
+ return -EAGAIN;
+ }
+ /* trap the stripe and hold a reference while the log owns it */
+ set_bit(STRIPE_LOG_TRAPPED, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ atomic_inc(&sh->count);
+ /* on failure park the stripe; ppl_io_unit_finished() releases parked stripes */
+ if (ppl_log_stripe(log, sh)) {
+ spin_lock_irq(&ppl_conf->no_mem_stripes_lock);
+ list_add_tail(&sh->log_list, &ppl_conf->no_mem_stripes);
+ spin_unlock_irq(&ppl_conf->no_mem_stripes_lock);
+ }
+
+ mutex_unlock(&log->io_mutex);
+
+ return 0;
+}
+
+/*
+ * Completion handler for the PPL write of an io_unit; ppl_submit_iounit()
+ * also calls it directly when the write is skipped. An I/O error is reported
+ * against the log device with md_error(). Every stripe attached to the
+ * io_unit is then unlinked, flagged STRIPE_HANDLE and released.
+ */
+static void ppl_log_endio(struct bio *bio)
+{
+	struct ppl_io_unit *io = bio->bi_private;
+	struct ppl_log *log = io->log;
+	struct stripe_head *cur, *tmp;
+
+	pr_debug("%s: seq: %llu\n", __func__, io->seq);
+
+	if (bio->bi_status)
+		md_error(log->ppl_conf->mddev, log->rdev);
+
+	/* _safe variant: each stripe is removed from the list while walking it */
+	list_for_each_entry_safe(cur, tmp, &io->stripe_list, log_list) {
+		list_del_init(&cur->log_list);
+		set_bit(STRIPE_HANDLE, &cur->state);
+		raid5_release_stripe(cur);
+	}
+}
+
+/*
+ * Submit @bio, which carries (part of) the PPL write of @io, after emitting a
+ * debug message with its size, start sector and target device.
+ */
+static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
+{
+	char devname[BDEVNAME_SIZE];
+
+	pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
+		 __func__,
+		 io->seq,
+		 bio->bi_iter.bi_size,
+		 (unsigned long long)bio->bi_iter.bi_sector,
+		 bdevname(bio->bi_bdev, devname));
+
+	submit_bio(bio);
+}
+/*
+ * Finalize the header of @io and write the header page followed by the
+ * partial parity pages of its stripes to the PPL area of the log device.
+ * Stripes that are full stripe writes contribute no partial parity page.
+ * If the device is missing or Faulty nothing is written and the io_unit is
+ * completed immediately through ppl_log_endio().
+ */
+static void ppl_submit_iounit(struct ppl_io_unit *io)
+{
+ struct ppl_log *log = io->log;
+ struct ppl_conf *ppl_conf = log->ppl_conf;
+ struct ppl_header *pplhdr = page_address(io->header_page);
+ struct bio *bio = &io->bio;
+ struct stripe_head *sh;
+ int i;
+
+ bio->bi_private = io;
+ /* no usable device: run the completion handler without doing any I/O */
+ if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
+ ppl_log_endio(bio);
+ return;
+ }
+
+ for (i = 0; i < io->entries_count; i++) {
+ struct ppl_header_entry *e = &pplhdr->entries[i];
+
+ pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
+ __func__, io->seq, i, le64_to_cpu(e->data_sector),
+ le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size));
+ /* store data_sector in block_size units and invert the running crc */
+ e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >>
+ ilog2(ppl_conf->block_size >> 9));
+ e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
+ }
+ /* the header crc is taken over the whole header before pplhdr->checksum is stored */
+ pplhdr->entries_count = cpu_to_le32(io->entries_count);
+ pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));
+
+ bio->bi_end_io = ppl_log_endio;
+ bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
+ bio->bi_bdev = log->rdev->bdev;
+ bio->bi_iter.bi_sector = log->rdev->ppl.sector;
+ bio_add_page(bio, io->header_page, PAGE_SIZE, 0);
+
+ list_for_each_entry(sh, &io->stripe_list, log_list) {
+ /* entries for full stripe writes have no partial parity */
+ if (test_bit(STRIPE_FULL_WRITE, &sh->state))
+ continue;
+ /* bio is full: continue in a new bio chained to it and submit the full one */
+ if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
+ struct bio *prev = bio;
+
+ bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
+ ppl_conf->bs);
+ bio->bi_opf = prev->bi_opf;
+ bio->bi_bdev = prev->bi_bdev;
+ bio->bi_iter.bi_sector = bio_end_sector(prev);
+ bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);
+
+ bio_chain(bio, prev);
+ ppl_submit_iounit_bio(io, prev);
+ }
+ }
+
+ ppl_submit_iounit_bio(io, bio);
+}
+
+/*
+ * Submit the io_unit at the head of @log's io_list unless it has already been
+ * submitted. If that unit is also the one currently being assembled,
+ * log->current_io is cleared before it is sent. The list head is looked up
+ * under io_list_lock; the submission itself happens outside of it.
+ */
+static void ppl_submit_current_io(struct ppl_log *log)
+{
+	struct ppl_io_unit *head;
+	bool already_sent;
+
+	spin_lock_irq(&log->io_list_lock);
+	head = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
+					log_sibling);
+	already_sent = head && head->submitted;
+	spin_unlock_irq(&log->io_list_lock);
+
+	if (!head || already_sent)
+		return;
+
+	head->submitted = true;
+
+	if (log->current_io == head)
+		log->current_io = NULL;
+
+	ppl_submit_iounit(head);
+}
+
+/*
+ * Walk all child logs of @conf and, holding each log's io_mutex, submit its
+ * pending io_unit (if any) via ppl_submit_current_io().
+ */
+void ppl_write_stripe_run(struct r5conf *conf)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+	int idx = 0;
+
+	while (idx < ppl_conf->count) {
+		struct ppl_log *child = &ppl_conf->child_logs[idx];
+
+		mutex_lock(&child->io_mutex);
+		ppl_submit_current_io(child);
+		mutex_unlock(&child->io_mutex);
+
+		idx++;
+	}
+}
+/*
+ * Called when the last pending stripe of @io has finished: unlink the io_unit
+ * from its log, return it to the mempool and, if a stripe is parked on
+ * no_mem_stripes, release the first one with STRIPE_HANDLE set. Interrupts
+ * stay disabled across both locked sections.
+ */
+static void ppl_io_unit_finished(struct ppl_io_unit *io)
+{
+ struct ppl_log *log = io->log;
+ struct ppl_conf *ppl_conf = log->ppl_conf;
+ unsigned long flags;
+
+ pr_debug("%s: seq: %llu\n", __func__, io->seq);
+
+ local_irq_save(flags);
+
+ spin_lock(&log->io_list_lock);
+ list_del(&io->log_sibling);
+ spin_unlock(&log->io_list_lock);
+
+ mempool_free(io, ppl_conf->io_pool);
+ /* an io_unit was just returned to the pool: release one parked stripe, if any */
+ spin_lock(&ppl_conf->no_mem_stripes_lock);
+ if (!list_empty(&ppl_conf->no_mem_stripes)) {
+ struct stripe_head *sh;
+
+ sh = list_first_entry(&ppl_conf->no_mem_stripes,
+ struct stripe_head, log_list);
+ list_del_init(&sh->log_list);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ raid5_release_stripe(sh);
+ }
+ spin_unlock(&ppl_conf->no_mem_stripes_lock);
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Detach @sh from its PPL io_unit, if it has one. The stripe that drops the
+ * io_unit's pending_stripes count to zero also finishes the io_unit.
+ */
+void ppl_stripe_write_finished(struct stripe_head *sh)
+{
+	struct ppl_io_unit *unit = sh->ppl_io;
+
+	sh->ppl_io = NULL;
+
+	if (!unit)
+		return;
+
+	if (atomic_dec_and_test(&unit->pending_stripes))
+		ppl_io_unit_finished(unit);
+}
+
+/*
+ * Run an async_xor() of @size bytes with @page1 as destination and
+ * { @page1, @page2 } as sources, then call async_tx_quiesce() on the returned
+ * descriptor before returning.
+ */
+static void ppl_xor(int size, struct page *page1, struct page *page2)
+{
+	struct page *srcs[] = { page1, page2 };
+	struct async_submit_ctl ctl;
+	struct dma_async_tx_descriptor *desc;
+
+	init_async_submit(&ctl, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
+			  NULL, NULL, NULL, NULL);
+
+	desc = async_xor(page1, srcs, 0, 2, size, &ctl);
+	async_tx_quiesce(&desc);
+}
+
+/*
+ * PPL recovery strategy: xor partial parity and data from all modified data
+ * disks within a stripe and write the result as the new stripe parity. If all
+ * stripe data disks are modified (full stripe write), no partial parity is
+ * available, so just xor the data disks.
+ *
+ * Recovery of a PPL entry shall occur only if all modified data disks are
+ * available and read from all of them succeeds.
+ *
+ * A PPL entry applies to a stripe, partial parity size for an entry is at most
+ * the size of the chunk. Examples of possible cases for a single entry:
+ *
+ * case 0: single data disk write:
+ * data0 data1 data2 ppl parity
+ * +--------+--------+--------+ +--------------------+
+ * | ------ | ------ | ------ | +----+ | (no change) |
+ * | ------ | -data- | ------ | | pp | -> | data1 ^ pp |
+ * | ------ | -data- | ------ | | pp | -> | data1 ^ pp |
+ * | ------ | ------ | ------ | +----+ | (no change) |
+ * +--------+--------+--------+ +--------------------+
+ * pp_size = data_size
+ *
+ * case 1: more than one data disk write:
+ * data0 data1 data2 ppl parity
+ * +--------+--------+--------+ +--------------------+
+ * | ------ | ------ | ------ | +----+ | (no change) |
+ * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
+ * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
+ * | ------ | ------ | ------ | +----+ | (no change) |
+ * +--------+--------+--------+ +--------------------+
+ * pp_size = data_size / modified_data_disks
+ *
+ * case 2: write to all data disks (also full stripe write):
+ * data0 data1 data2 parity
+ * +--------+--------+--------+ +--------------------+
+ * | ------ | ------ | ------ | | (no change) |
+ * | -data- | -data- | -data- | --------> | xor all data |
+ * | ------ | ------ | ------ | --------> | (no change) |
+ * | ------ | ------ | ------ | | (no change) |
+ * +--------+--------+--------+ +--------------------+
+ * pp_size = 0
+ *
+ * The following cases are possible only in other implementations. The recovery
+ * code can handle them, but they are not generated at runtime because they can
+ * be reduced to cases 0, 1 and 2:
+ *
+ * case 3:
+ * data0 data1 data2 ppl parity
+ * +--------+--------+--------+ +----+ +--------------------+
+ * | ------ | -data- | -data- | | pp | | data1 ^ data2 ^ pp |
+ * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp |
+ * | -data- | -data- | -data- | | -- | -> | xor all data |
+ * | -data- | -data- | ------ | | pp | | data0 ^ data1 ^ pp |
+ * +--------+--------+--------+ +----+ +--------------------+
+ * pp_size = chunk_size
+ *
+ * case 4:
+ * data0 data1 data2 ppl parity
+ * +--------+--------+--------+ +----+ +--------------------+
+ * | ------ | -data- | ------ | | pp | | data1 ^ pp |
+ * | ------ | ------ | ------ | | -- | -> | (no change) |
+ * | ------ | ------ | ------ | | -- | -> | (no change) |
+ * | -data- | ------ | ------ | | pp | | data0 ^ pp |
+ * +--------+--------+--------+ +----+ +--------------------+
+ * pp_size = chunk_size
+ *
+ * @log: child log whose rdev holds the partial parity of the entry
+ * @e: header entry to recover
+ * @ppl_sector: start sector of the entry's partial parity, as computed by the
+ * caller
+ *
+ * Returns 0 on success, -ENOMEM if the two scratch pages cannot be allocated,
+ * or -EIO when a data read, partial parity read or parity write fails.
+ */
+static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
+ sector_t ppl_sector)
+{
+ struct ppl_conf *ppl_conf = log->ppl_conf;
+ struct mddev *mddev = ppl_conf->mddev;
+ struct r5conf *conf = mddev->private;
+ int block_size = ppl_conf->block_size;
+ struct page *page1;
+ struct page *page2;
+ sector_t r_sector_first;
+ sector_t r_sector_last;
+ int strip_sectors;
+ int data_disks;
+ int i;
+ int ret = 0;
+ char b[BDEVNAME_SIZE];
+ unsigned int pp_size = le32_to_cpu(e->pp_size);
+ unsigned int data_size = le32_to_cpu(e->data_size);
+
+ page1 = alloc_page(GFP_KERNEL);
+ page2 = alloc_page(GFP_KERNEL);
+
+ if (!page1 || !page2) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /* data_sector is stored in block_size units; convert it to 512-byte sectors */
+ r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9);
+ /* work out how many data disks and how many sectors per disk the entry covers */
+ if ((pp_size >> 9) < conf->chunk_sectors) {
+ if (pp_size > 0) {
+ data_disks = data_size / pp_size;
+ strip_sectors = pp_size >> 9;
+ } else {
+ data_disks = conf->raid_disks - conf->max_degraded;
+ strip_sectors = (data_size >> 9) / data_disks;
+ }
+ r_sector_last = r_sector_first +
+ (data_disks - 1) * conf->chunk_sectors +
+ strip_sectors;
+ } else {
+ data_disks = conf->raid_disks - conf->max_degraded;
+ strip_sectors = conf->chunk_sectors;
+ r_sector_last = r_sector_first + (data_size >> 9);
+ }
+
+ pr_debug("%s: array sector first: %llu last: %llu\n", __func__,
+ (unsigned long long)r_sector_first,
+ (unsigned long long)r_sector_last);
+
+ /* if start and end is 4k aligned, use a 4k block */
+ if (block_size == 512 &&
+ (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
+ (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
+ block_size = STRIPE_SIZE;
+
+ /* iterate through blocks in strip */
+ for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
+ bool update_parity = false;
+ sector_t parity_sector;
+ struct md_rdev *parity_rdev;
+ struct stripe_head sh;
+ int disk;
+ int indent = 0;
+
+ pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i);
+ indent += 2;
+ /* page1 accumulates the xor result for this block, starting from zero */
+ memset(page_address(page1), 0, PAGE_SIZE);
+
+ /* iterate through data member disks */
+ for (disk = 0; disk < data_disks; disk++) {
+ int dd_idx;
+ struct md_rdev *rdev;
+ sector_t sector;
+ sector_t r_sector = r_sector_first + i +
+ (disk * conf->chunk_sectors);
+
+ pr_debug("%s:%*s data member disk %d start\n",
+ __func__, indent, "", disk);
+ indent += 2;
+
+ if (r_sector >= r_sector_last) {
+ pr_debug("%s:%*s array sector %llu doesn't need parity update\n",
+ __func__, indent, "",
+ (unsigned long long)r_sector);
+ indent -= 2;
+ continue;
+ }
+
+ update_parity = true;
+
+ /* map raid sector to member disk */
+ sector = raid5_compute_sector(conf, r_sector, 0,
+ &dd_idx, NULL);
+ pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n",
+ __func__, indent, "",
+ (unsigned long long)r_sector, dd_idx,
+ (unsigned long long)sector);
+
+ rdev = conf->disks[dd_idx].rdev;
+ if (!rdev) {
+ pr_debug("%s:%*s data member disk %d missing\n",
+ __func__, indent, "", dd_idx);
+ update_parity = false;
+ break;
+ }
+
+ pr_debug("%s:%*s reading data member disk %s sector %llu\n",
+ __func__, indent, "", bdevname(rdev->bdev, b),
+ (unsigned long long)sector);
+ if (!sync_page_io(rdev, sector, block_size, page2,
+ REQ_OP_READ, 0, false)) {
+ md_error(mddev, rdev);
+ pr_debug("%s:%*s read failed!\n", __func__,
+ indent, "");
+ ret = -EIO;
+ goto out;
+ }
+
+ ppl_xor(block_size, page1, page2);
+
+ indent -= 2;
+ }
+ /* nothing was modified here, or a data disk is missing: leave the parity alone */
+ if (!update_parity)
+ continue;
+ /* fold in the logged partial parity, if the entry has any */
+ if (pp_size > 0) {
+ pr_debug("%s:%*s reading pp disk sector %llu\n",
+ __func__, indent, "",
+ (unsigned long long)(ppl_sector + i));
+ if (!sync_page_io(log->rdev,
+ ppl_sector - log->rdev->data_offset + i,
+ block_size, page2, REQ_OP_READ, 0,
+ false)) {
+ pr_debug("%s:%*s read failed!\n", __func__,
+ indent, "");
+ md_error(mddev, log->rdev);
+ ret = -EIO;
+ goto out;
+ }
+
+ ppl_xor(block_size, page1, page2);
+ }
+
+ /* map raid sector to parity disk */
+ parity_sector = raid5_compute_sector(conf, r_sector_first + i,
+ 0, &disk, &sh);
+ BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
+ parity_rdev = conf->disks[sh.pd_idx].rdev;
+ /* the parity device must be the device this log was read from */
+ BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
+ pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
+ __func__, indent, "",
+ (unsigned long long)parity_sector,
+ bdevname(parity_rdev->bdev, b));
+ if (!sync_page_io(parity_rdev, parity_sector, block_size,
+ page1, REQ_OP_WRITE, 0, false)) {
+ pr_debug("%s:%*s parity write error!\n", __func__,
+ indent, "");
+ md_error(mddev, parity_rdev);
+ ret = -EIO;
+ goto out;
+ }
+ }
+out:
+ if (page1)
+ __free_page(page1);
+ if (page2)
+ __free_page(page2);
+ return ret;
+}
+/*
+ * Replay the PPL described by header @pplhdr for @log. For every entry the
+ * partial parity is read back and its checksum verified; the entry is then
+ * recovered if the checksum matches, or counted in mismatch_count and skipped
+ * otherwise. After all entries were processed the member device is flushed.
+ *
+ * Returns 0 on success, -ENOMEM if the scratch page cannot be allocated, -EIO
+ * on a failed partial parity read, or the error returned by
+ * ppl_recover_entry() or blkdev_issue_flush().
+ */
+static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr)
+{
+ struct ppl_conf *ppl_conf = log->ppl_conf;
+ struct md_rdev *rdev = log->rdev;
+ struct mddev *mddev = rdev->mddev;
+ sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9);
+ struct page *page;
+ int i;
+ int ret = 0;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ /* iterate through all PPL entries saved */
+ for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) {
+ struct ppl_header_entry *e = &pplhdr->entries[i];
+ u32 pp_size = le32_to_cpu(e->pp_size);
+ sector_t sector = ppl_sector;
+ int ppl_entry_sectors = pp_size >> 9;
+ u32 crc, crc_stored;
+
+ pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n",
+ __func__, rdev->raid_disk, i,
+ (unsigned long long)ppl_sector, pp_size);
+
+ crc = ~0;
+ crc_stored = le32_to_cpu(e->checksum);
+
+ /* read partial parity for this entry and calculate its checksum */
+ while (pp_size) {
+ int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;
+
+ if (!sync_page_io(rdev, sector - rdev->data_offset,
+ s, page, REQ_OP_READ, 0, false)) {
+ md_error(mddev, rdev);
+ ret = -EIO;
+ goto out;
+ }
+
+ crc = crc32c_le(crc, page_address(page), s);
+
+ pp_size -= s;
+ sector += s >> 9;
+ }
+
+ crc = ~crc;
+
+ if (crc != crc_stored) {
+ /*
+ * Don't recover this entry if the checksum does not
+ * match, but keep going and try to recover other
+ * entries.
+ */
+ pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
+ __func__, crc_stored, crc);
+ ppl_conf->mismatch_count++;
+ } else {
+ ret = ppl_recover_entry(log, e, ppl_sector);
+ if (ret)
+ goto out;
+ ppl_conf->recovered_entries++;
+ }
+ /* the partial parity of the next entry follows directly on disk */
+ ppl_sector += ppl_entry_sectors;
+ }
+
+ /* flush the disk cache after recovery if necessary */
+ ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL);
+out:
+ __free_page(page);
+ return ret;
+}
+
+/*
+ * Write a PPL header without any entries to the PPL area of @log's device.
+ * The header is built in a zeroed page: reserved bytes set to 0xff, the
+ * configured signature, and a checksum over the page.
+ *
+ * Returns 0 on success, -ENOMEM if the page cannot be allocated, or -EIO
+ * (after reporting the device with md_error()) if the write fails.
+ */
+static int ppl_write_empty_header(struct ppl_log *log)
+{
+	struct md_rdev *rdev = log->rdev;
+	struct ppl_header *hdr;
+	struct page *hdr_page;
+	int err = 0;
+
+	pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__,
+		 rdev->raid_disk, (unsigned long long)rdev->ppl.sector);
+
+	hdr_page = alloc_page(GFP_NOIO | __GFP_ZERO);
+	if (!hdr_page)
+		return -ENOMEM;
+
+	hdr = page_address(hdr_page);
+	memset(hdr->reserved, 0xff, PPL_HDR_RESERVED);
+	hdr->signature = cpu_to_le32(log->ppl_conf->signature);
+	/* the checksum field is still zero while the crc is computed */
+	hdr->checksum = cpu_to_le32(~crc32c_le(~0, hdr, PAGE_SIZE));
+
+	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
+			  PPL_HEADER_SIZE, hdr_page, REQ_OP_WRITE | REQ_SYNC |
+			  REQ_FUA, 0, false)) {
+		md_error(rdev->mddev, rdev);
+		err = -EIO;
+	}
+
+	__free_page(hdr_page);
+
+	return err;
+}
+/*
+ * Read and validate the PPL header of @log's member device. If the header is
+ * valid and a dirty array is being started, recover from the log. Whenever
+ * the array is being started (mddev->pers is not set) and no error occurred,
+ * an empty header is written afterwards - also when validation failed.
+ *
+ * A header checksum or signature mismatch is counted in mismatch_count but is
+ * not returned as an error. Returns 0, -ENOMEM, -EIO on a failed header read,
+ * or the error from ppl_recover() or ppl_write_empty_header().
+ */
+static int ppl_load_distributed(struct ppl_log *log)
+{
+ struct ppl_conf *ppl_conf = log->ppl_conf;
+ struct md_rdev *rdev = log->rdev;
+ struct mddev *mddev = rdev->mddev;
+ struct page *page;
+ struct ppl_header *pplhdr;
+ u32 crc, crc_stored;
+ u32 signature;
+ int ret = 0;
+
+ pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk);
+
+ /* read PPL header */
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
+ PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
+ md_error(mddev, rdev);
+ ret = -EIO;
+ goto out;
+ }
+ pplhdr = page_address(page);
+
+ /* check header validity */
+ crc_stored = le32_to_cpu(pplhdr->checksum);
+ pplhdr->checksum = 0; /* the crc is computed with the checksum field zeroed */
+ crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);
+
+ if (crc_stored != crc) {
+ pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n",
+ __func__, crc_stored, crc);
+ ppl_conf->mismatch_count++;
+ goto out;
+ }
+
+ signature = le32_to_cpu(pplhdr->signature);
+
+ if (mddev->external) {
+ /*
+ * For external metadata the header signature is set and
+ * validated in userspace.
+ */
+ ppl_conf->signature = signature;
+ } else if (ppl_conf->signature != signature) {
+ pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n",
+ __func__, signature, ppl_conf->signature);
+ ppl_conf->mismatch_count++;
+ goto out;
+ }
+
+ /* attempt to recover from log if we are starting a dirty array */
+ if (!mddev->pers && mddev->recovery_cp != MaxSector)
+ ret = ppl_recover(log, pplhdr);
+out:
+ /* write empty header if we are starting the array */
+ if (!ret && !mddev->pers)
+ ret = ppl_write_empty_header(log);
+
+ __free_page(page);
+
+ pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
+ __func__, ret, ppl_conf->mismatch_count,
+ ppl_conf->recovered_entries);
+ return ret;
+}
+
+/*
+ * Load the PPL of every present member device, stopping at the first error.
+ * With external metadata the header signature is taken from the drives, so
+ * all loaded drives are additionally required to carry the same signature.
+ *
+ * Returns 0 on success, the error from ppl_load_distributed(), or -EINVAL
+ * when the signatures of the member drives differ.
+ */
+static int ppl_load(struct ppl_conf *ppl_conf)
+{
+	bool have_ref_sig = false;
+	u32 ref_sig = 0;
+	int err = 0;
+	int idx;
+
+	for (idx = 0; idx < ppl_conf->count; idx++) {
+		struct ppl_log *log = &ppl_conf->child_logs[idx];
+
+		/* nothing to load for a missing drive */
+		if (!log->rdev)
+			continue;
+
+		err = ppl_load_distributed(log);
+		if (err)
+			break;
+
+		/*
+		 * A single drive cannot tell whether an external-metadata
+		 * signature is correct; only consistency across drives can
+		 * be checked.
+		 */
+		if (!ppl_conf->mddev->external)
+			continue;
+
+		/* the first loaded drive provides the reference signature */
+		if (!have_ref_sig) {
+			ref_sig = ppl_conf->signature;
+			have_ref_sig = true;
+			continue;
+		}
+
+		if (ref_sig != ppl_conf->signature) {
+			pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
+				mdname(ppl_conf->mddev));
+			err = -EINVAL;
+			break;
+		}
+	}
+
+	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
+		 __func__, err, ppl_conf->mismatch_count,
+		 ppl_conf->recovered_entries);
+
+	return err;
+}
+
+/*
+ * Clear MD_HAS_PPL on the array and free everything owned by @ppl_conf: the
+ * child logs, the bio set (if one was created), the io_unit mempool and its
+ * backing slab cache, and finally @ppl_conf itself.
+ */
+static void __ppl_exit_log(struct ppl_conf *ppl_conf)
+{
+	struct mddev *mddev = ppl_conf->mddev;
+
+	clear_bit(MD_HAS_PPL, &mddev->flags);
+
+	kfree(ppl_conf->child_logs);
+
+	if (ppl_conf->bs != NULL)
+		bioset_free(ppl_conf->bs);
+
+	/* the pool is destroyed before the cache it was created with */
+	mempool_destroy(ppl_conf->io_pool);
+	kmem_cache_destroy(ppl_conf->io_kc);
+
+	kfree(ppl_conf);
+}
+
+/*
+ * Tear down the PPL state attached to @conf, if there is any, and reset
+ * conf->log_private to NULL.
+ */
+void ppl_exit_log(struct r5conf *conf)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+
+	if (!ppl_conf)
+		return;
+
+	__ppl_exit_log(ppl_conf);
+	conf->log_private = NULL;
+}
+
+/*
+ * Check that the PPL area configured on @rdev is usable and trim its size.
+ *
+ * The area must hold the header plus partial parity for at least one stripe;
+ * the partial parity space is rounded down to a multiple of STRIPE_SECTORS.
+ * The area must not overlap the data area and, unless the array uses external
+ * metadata, must not overlap the superblock.
+ *
+ * On success rdev->ppl.size is updated to the rounded size and 0 is returned.
+ * Returns -ENOSPC if the area is too small, -EINVAL on an overlap.
+ */
+static int ppl_validate_rdev(struct md_rdev *rdev)
+{
+	char b[BDEVNAME_SIZE];
+	int ppl_data_sectors;
+	int ppl_size_new;
+	bool overlaps_data;
+	bool overlaps_sb = false;
+
+	ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);
+	if (ppl_data_sectors > 0)
+		ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);
+
+	if (ppl_data_sectors <= 0) {
+		pr_warn("md/raid:%s: PPL space too small on %s\n",
+			mdname(rdev->mddev), bdevname(rdev->bdev, b));
+		return -ENOSPC;
+	}
+
+	ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9);
+
+	/* PPL before the data: its end must not reach into the data area */
+	if (rdev->ppl.sector < rdev->data_offset)
+		overlaps_data = rdev->ppl.sector + ppl_size_new >
+				rdev->data_offset;
+	else	/* PPL at or after data_offset: it must start past the data */
+		overlaps_data = rdev->data_offset + rdev->sectors >
+				rdev->ppl.sector;
+
+	if (overlaps_data) {
+		pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
+			mdname(rdev->mddev), bdevname(rdev->bdev, b));
+		return -EINVAL;
+	}
+
+	/* ppl.offset is relative to the superblock; only checked for native metadata */
+	if (!rdev->mddev->external) {
+		if (rdev->ppl.offset > 0)
+			overlaps_sb = rdev->ppl.offset < (rdev->sb_size >> 9);
+		else
+			overlaps_sb = rdev->ppl.offset + ppl_size_new > 0;
+	}
+
+	if (overlaps_sb) {
+		pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
+			mdname(rdev->mddev), bdevname(rdev->bdev, b));
+		return -EINVAL;
+	}
+
+	rdev->ppl.size = ppl_size_new;
+
+	return 0;
+}
+/*
+ * Set up the distributed Partial Parity Log for the array of @conf: reject
+ * unsupported configurations, allocate the per-array state with one child log
+ * per raid disk, validate the PPL area of every present member and load
+ * (possibly recover) the logs.
+ *
+ * On success conf->log_private and MD_HAS_PPL are set and 0 is returned. On
+ * failure after the ppl_conf allocation, the state is released through
+ * __ppl_exit_log() and a negative errno is returned.
+ */
+int ppl_init_log(struct r5conf *conf)
+{
+ struct ppl_conf *ppl_conf;
+ struct mddev *mddev = conf->mddev;
+ int ret = 0;
+ int i;
+ bool need_cache_flush = false;
+
+ pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
+ mdname(conf->mddev));
+ /* only a 4096-byte PAGE_SIZE is accepted */
+ if (PAGE_SIZE != 4096)
+ return -EINVAL;
+
+ if (mddev->level != 5) {
+ pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
+ mdname(mddev), mddev->level);
+ return -EINVAL;
+ }
+
+ if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
+ pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+ pr_warn("md/raid:%s PPL is not compatible with journal\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+
+ ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
+ if (!ppl_conf)
+ return -ENOMEM;
+
+ ppl_conf->mddev = mddev;
+
+ ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
+ if (!ppl_conf->io_kc) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ppl_conf->io_pool = mempool_create(conf->raid_disks, ppl_io_pool_alloc,
+ ppl_io_pool_free, ppl_conf->io_kc);
+ if (!ppl_conf->io_pool) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ppl_conf->bs = bioset_create(conf->raid_disks, 0, BIOSET_NEED_BVECS);
+ if (!ppl_conf->bs) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ppl_conf->count = conf->raid_disks;
+ ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
+ GFP_KERNEL);
+ if (!ppl_conf->child_logs) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ atomic64_set(&ppl_conf->seq, 0);
+ INIT_LIST_HEAD(&ppl_conf->no_mem_stripes);
+ spin_lock_init(&ppl_conf->no_mem_stripes_lock);
+ /* native metadata: signature derived from the array uuid, 512-byte blocks */
+ if (!mddev->external) {
+ ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
+ ppl_conf->block_size = 512;
+ } else {
+ ppl_conf->block_size = queue_logical_block_size(mddev->queue);
+ }
+ /* initialise one child log per raid disk; rdev stays NULL for missing disks */
+ for (i = 0; i < ppl_conf->count; i++) {
+ struct ppl_log *log = &ppl_conf->child_logs[i];
+ struct md_rdev *rdev = conf->disks[i].rdev;
+
+ mutex_init(&log->io_mutex);
+ spin_lock_init(&log->io_list_lock);
+ INIT_LIST_HEAD(&log->io_list);
+
+ log->ppl_conf = ppl_conf;
+ log->rdev = rdev;
+
+ if (rdev) {
+ struct request_queue *q;
+
+ ret = ppl_validate_rdev(rdev);
+ if (ret)
+ goto err;
+
+ q = bdev_get_queue(rdev->bdev);
+ if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+ need_cache_flush = true;
+ }
+ }
+
+ if (need_cache_flush)
+ pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
+ mdname(mddev));
+
+ /* load and possibly recover the logs from the member disks */
+ ret = ppl_load(ppl_conf);
+
+ if (ret) {
+ goto err;
+ } else if (!mddev->pers &&
+ mddev->recovery_cp == 0 && !mddev->degraded &&
+ ppl_conf->recovered_entries > 0 &&
+ ppl_conf->mismatch_count == 0) {
+ /*
+ * If we are starting a dirty array and the recovery succeeds
+ * without any issues, set the array as clean.
+ */
+ mddev->recovery_cp = MaxSector;
+ set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+ } else if (mddev->pers && ppl_conf->mismatch_count > 0) {
+ /* no mismatch allowed when enabling PPL for a running array */
+ ret = -EINVAL;
+ goto err;
+ }
+
+ conf->log_private = ppl_conf;
+ set_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
+
+ return 0;
+err:
+ __ppl_exit_log(ppl_conf); /* NOTE(review): this also clears MD_HAS_PPL in mddev->flags */
+ return ret;
+}
+
+/*
+ * Attach @rdev to (add == true) or detach it from (add == false) the child
+ * log of its raid_disk slot, under that log's io_mutex.
+ *
+ * When adding, the PPL area of @rdev is validated first and an empty PPL
+ * header is written to it.
+ *
+ * Returns -EINVAL if @rdev is NULL, 0 without doing anything if @rdev has no
+ * raid_disk slot (raid_disk < 0), -ENODEV if the slot is beyond the number of
+ * child logs, otherwise 0 or the error from ppl_validate_rdev() or
+ * ppl_write_empty_header().
+ */
+int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+	struct ppl_log *child;
+	char b[BDEVNAME_SIZE];
+	int err = 0;
+
+	if (!rdev)
+		return -EINVAL;
+
+	pr_debug("%s: disk: %d operation: %s dev: %s\n",
+		 __func__, rdev->raid_disk, add ? "add" : "remove",
+		 bdevname(rdev->bdev, b));
+
+	if (rdev->raid_disk < 0)
+		return 0;
+
+	if (rdev->raid_disk >= ppl_conf->count)
+		return -ENODEV;
+
+	child = &ppl_conf->child_logs[rdev->raid_disk];
+
+	mutex_lock(&child->io_mutex);
+	if (!add) {
+		child->rdev = NULL;
+	} else {
+		err = ppl_validate_rdev(rdev);
+		if (err == 0) {
+			child->rdev = rdev;
+			err = ppl_write_empty_header(child);
+		}
+	}
+	mutex_unlock(&child->io_mutex);
+
+	return err;
+}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3c7e106c12a2..aeeb8d6854e2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -55,12 +55,16 @@
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <linux/flex_array.h>
+#include <linux/sched/signal.h>
+
#include <trace/events/block.h>
+#include <linux/list_sort.h>
#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"
+#include "raid5-log.h"
#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
@@ -99,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
int i;
- local_irq_disable();
- spin_lock(conf->hash_locks);
+ spin_lock_irq(conf->hash_locks);
for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
spin_lock(&conf->device_lock);
@@ -110,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
int i;
spin_unlock(&conf->device_lock);
- for (i = NR_STRIPE_HASH_LOCKS; i; i--)
- spin_unlock(conf->hash_locks + i - 1);
- local_irq_enable();
+ for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
+ spin_unlock(conf->hash_locks + i);
+ spin_unlock_irq(conf->hash_locks);
}
/* Find first data disk in a raid6 stripe */
@@ -154,17 +157,6 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
return slot;
}
-static void return_io(struct bio_list *return_bi)
-{
- struct bio *bi;
- while ((bi = bio_list_pop(return_bi)) != NULL) {
- bi->bi_iter.bi_size = 0;
- trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
- bi, 0);
- bio_endio(bi);
- }
-}
-
static void print_raid5_conf (struct r5conf *conf);
static int stripe_operations_active(struct stripe_head *sh)
@@ -174,6 +166,13 @@ static int stripe_operations_active(struct stripe_head *sh)
test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}
+/*
+ * A stripe is low priority when it is flagged as an r5c full or partial
+ * stripe and STRIPE_R5C_CACHING is not set.
+ */
+static bool stripe_is_lowprio(struct stripe_head *sh)
+{
+	bool r5c_stripe = test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
+			  test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
+
+	if (!r5c_stripe)
+		return false;
+
+	return !test_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+
static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
@@ -189,7 +188,10 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
if (list_empty(&sh->lru)) {
struct r5worker_group *group;
group = conf->worker_groups + cpu_to_group(cpu);
- list_add_tail(&sh->lru, &group->handle_list);
+ if (stripe_is_lowprio(sh))
+ list_add_tail(&sh->lru, &group->loprio_list);
+ else
+ list_add_tail(&sh->lru, &group->handle_list);
group->stripes_cnt++;
sh->group = group;
}
@@ -231,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
if (test_bit(R5_InJournal, &sh->dev[i].flags))
injournal++;
/*
- * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
- * data in journal, so they are not released to cached lists
+ * In the following cases, the stripe cannot be released to cached
+ * lists. Therefore, we make the stripe write out and set
+ * STRIPE_HANDLE:
+ * 1. when quiesce in r5c write back;
+ * 2. when resync is requested for the stripe.
*/
- if (conf->quiesce && r5c_is_writeback(conf->log) &&
- !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
+ (conf->quiesce && r5c_is_writeback(conf->log) &&
+ !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
if (test_bit(STRIPE_R5C_CACHING, &sh->state))
r5c_make_stripe_write_out(sh);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -252,7 +258,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
clear_bit(STRIPE_DELAYED, &sh->state);
clear_bit(STRIPE_BIT_DELAY, &sh->state);
if (conf->worker_cnt_per_group == 0) {
- list_add_tail(&sh->lru, &conf->handle_list);
+ if (stripe_is_lowprio(sh))
+ list_add_tail(&sh->lru,
+ &conf->loprio_list);
+ else
+ list_add_tail(&sh->lru,
+ &conf->handle_list);
} else {
raid5_wakeup_stripe_thread(sh);
return;
@@ -281,13 +292,13 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
atomic_dec(&conf->r5c_cached_partial_stripes);
list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
r5c_check_cached_full_stripe(conf);
- } else {
- /* partial stripe */
- if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
- &sh->state))
- atomic_inc(&conf->r5c_cached_partial_stripes);
+ } else
+ /*
+ * STRIPE_R5C_PARTIAL_STRIPE is set in
+ * r5c_try_caching_write(). No need to
+ * set it again.
+ */
list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
- }
}
}
}
@@ -353,17 +364,15 @@ static void release_inactive_stripe_list(struct r5conf *conf,
static int release_stripe_list(struct r5conf *conf,
struct list_head *temp_inactive_list)
{
- struct stripe_head *sh;
+ struct stripe_head *sh, *t;
int count = 0;
struct llist_node *head;
head = llist_del_all(&conf->released_stripes);
head = llist_reverse_order(head);
- while (head) {
+ llist_for_each_entry_safe(sh, t, head, release_list) {
int hash;
- sh = llist_entry(head, struct stripe_head, release_list);
- head = llist_next(head);
/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
smp_mb();
clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
@@ -481,6 +490,7 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
sh->dev[i].page = page;
sh->dev[i].orig_page = page;
}
+
return 0;
}
@@ -707,12 +717,11 @@ static bool is_full_stripe_write(struct stripe_head *sh)
static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
- local_irq_disable();
if (sh1 > sh2) {
- spin_lock(&sh2->stripe_lock);
+ spin_lock_irq(&sh2->stripe_lock);
spin_lock_nested(&sh1->stripe_lock, 1);
} else {
- spin_lock(&sh1->stripe_lock);
+ spin_lock_irq(&sh1->stripe_lock);
spin_lock_nested(&sh2->stripe_lock, 1);
}
}
@@ -720,8 +729,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
spin_unlock(&sh1->stripe_lock);
- spin_unlock(&sh2->stripe_lock);
- local_irq_enable();
+ spin_unlock_irq(&sh2->stripe_lock);
}
/* Only freshly new full stripe normal write stripe can be added to a batch list */
@@ -729,7 +737,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
@@ -863,6 +871,109 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
return 1;
}
+static void dispatch_bio_list(struct bio_list *tmp)
+{
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(tmp)))
+ generic_make_request(bio);
+}
+
+static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
+{
+ const struct r5pending_data *da = list_entry(a,
+ struct r5pending_data, sibling);
+ const struct r5pending_data *db = list_entry(b,
+ struct r5pending_data, sibling);
+ if (da->sector > db->sector)
+ return 1;
+ if (da->sector < db->sector)
+ return -1;
+ return 0;
+}
+
+static void dispatch_defer_bios(struct r5conf *conf, int target,
+ struct bio_list *list)
+{
+ struct r5pending_data *data;
+ struct list_head *first, *next = NULL;
+ int cnt = 0;
+
+ if (conf->pending_data_cnt == 0)
+ return;
+
+ list_sort(NULL, &conf->pending_list, cmp_stripe);
+
+ first = conf->pending_list.next;
+
+ /* temporarily move the head */
+ if (conf->next_pending_data)
+ list_move_tail(&conf->pending_list,
+ &conf->next_pending_data->sibling);
+
+ while (!list_empty(&conf->pending_list)) {
+ data = list_first_entry(&conf->pending_list,
+ struct r5pending_data, sibling);
+ if (&data->sibling == first)
+ first = data->sibling.next;
+ next = data->sibling.next;
+
+ bio_list_merge(list, &data->bios);
+ list_move(&data->sibling, &conf->free_list);
+ cnt++;
+ if (cnt >= target)
+ break;
+ }
+ conf->pending_data_cnt -= cnt;
+ BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
+
+ if (next != &conf->pending_list)
+ conf->next_pending_data = list_entry(next,
+ struct r5pending_data, sibling);
+ else
+ conf->next_pending_data = NULL;
+ /* list isn't empty */
+ if (first != &conf->pending_list)
+ list_move_tail(&conf->pending_list, first);
+}
+
+static void flush_deferred_bios(struct r5conf *conf)
+{
+ struct bio_list tmp = BIO_EMPTY_LIST;
+
+ if (conf->pending_data_cnt == 0)
+ return;
+
+ spin_lock(&conf->pending_bios_lock);
+ dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
+ BUG_ON(conf->pending_data_cnt != 0);
+ spin_unlock(&conf->pending_bios_lock);
+
+ dispatch_bio_list(&tmp);
+}
+
+static void defer_issue_bios(struct r5conf *conf, sector_t sector,
+ struct bio_list *bios)
+{
+ struct bio_list tmp = BIO_EMPTY_LIST;
+ struct r5pending_data *ent;
+
+ spin_lock(&conf->pending_bios_lock);
+ ent = list_first_entry(&conf->free_list, struct r5pending_data,
+ sibling);
+ list_move_tail(&ent->sibling, &conf->pending_list);
+ ent->sector = sector;
+ bio_list_init(&ent->bios);
+ bio_list_merge(&ent->bios, bios);
+ conf->pending_data_cnt++;
+ if (conf->pending_data_cnt >= PENDING_IO_MAX)
+ dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
+
+ spin_unlock(&conf->pending_bios_lock);
+
+ dispatch_bio_list(&tmp);
+}
+
static void
raid5_end_read_request(struct bio *bi);
static void
@@ -873,21 +984,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
struct r5conf *conf = sh->raid_conf;
int i, disks = sh->disks;
struct stripe_head *head_sh = sh;
+ struct bio_list pending_bios = BIO_EMPTY_LIST;
+ bool should_defer;
might_sleep();
- if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
- /* writing out phase */
- if (s->waiting_extra_page)
- return;
- if (r5l_write_stripe(conf->log, sh) == 0)
- return;
- } else { /* caching phase */
- if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
- r5c_cache_data(conf->log, sh, s);
- return;
- }
- }
+ if (log_stripe(sh, s) == 0)
+ return;
+
+ should_defer = conf->batch_bio_dispatch && conf->group_cnt;
for (i = disks; i--; ) {
int op, op_flags = 0;
@@ -1043,7 +1148,10 @@ again:
trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
bi, disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
- generic_make_request(bi);
+ if (should_defer && op_is_write(op))
+ bio_list_add(&pending_bios, bi);
+ else
+ generic_make_request(bi);
}
if (rrdev) {
if (s->syncing || s->expanding || s->expanded
@@ -1088,7 +1196,10 @@ again:
trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
rbi, disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
- generic_make_request(rbi);
+ if (should_defer && op_is_write(op))
+ bio_list_add(&pending_bios, rbi);
+ else
+ generic_make_request(rbi);
}
if (!rdev && !rrdev) {
if (op_is_write(op))
@@ -1106,6 +1217,9 @@ again:
if (sh != head_sh)
goto again;
}
+
+ if (should_defer && !bio_list_empty(&pending_bios))
+ defer_issue_bios(conf, head_sh->sector, &pending_bios);
}
static struct dma_async_tx_descriptor *
@@ -1175,7 +1289,6 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
static void ops_complete_biofill(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
- struct bio_list return_bi = BIO_EMPTY_LIST;
int i;
pr_debug("%s: stripe %llu\n", __func__,
@@ -1199,16 +1312,13 @@ static void ops_complete_biofill(void *stripe_head_ref)
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
- if (!raid5_dec_bi_active_stripes(rbi))
- bio_list_add(&return_bi, rbi);
+ bio_endio(rbi);
rbi = rbi2;
}
}
}
clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
- return_io(&return_bi);
-
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
}
@@ -1364,7 +1474,8 @@ static int set_syndrome_sources(struct page **srcs,
(test_bit(R5_Wantdrain, &dev->flags) ||
test_bit(R5_InJournal, &dev->flags))) ||
(srctype == SYNDROME_SRC_WRITTEN &&
- dev->written)) {
+ (dev->written ||
+ test_bit(R5_InJournal, &dev->flags)))) {
if (test_bit(R5_InJournal, &dev->flags))
srcs[slot] = sh->dev[i].orig_page;
else
@@ -1976,6 +2087,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
tx = ops_run_prexor6(sh, percpu, tx);
}
+ if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+ tx = ops_run_partial_parity(sh, percpu, tx);
+
if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
tx = ops_run_biodrain(sh, tx);
overlap_clear++;
@@ -2008,8 +2122,15 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
put_cpu();
}
+static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
+{
+ if (sh->ppl_page)
+ __free_page(sh->ppl_page);
+ kmem_cache_free(sc, sh);
+}
+
static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
- int disks)
+ int disks, struct r5conf *conf)
{
struct stripe_head *sh;
int i;
@@ -2023,6 +2144,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
INIT_LIST_HEAD(&sh->r5c);
INIT_LIST_HEAD(&sh->log_list);
atomic_set(&sh->count, 1);
+ sh->raid_conf = conf;
sh->log_start = MaxSector;
for (i = 0; i < disks; i++) {
struct r5dev *dev = &sh->dev[i];
@@ -2030,6 +2152,14 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
bio_init(&dev->req, &dev->vec, 1);
bio_init(&dev->rreq, &dev->rvec, 1);
}
+
+ if (raid5_has_ppl(conf)) {
+ sh->ppl_page = alloc_page(gfp);
+ if (!sh->ppl_page) {
+ free_stripe(sc, sh);
+ sh = NULL;
+ }
+ }
}
return sh;
}
@@ -2037,15 +2167,13 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
{
struct stripe_head *sh;
- sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size);
+ sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
if (!sh)
return 0;
- sh->raid_conf = conf;
-
if (grow_buffers(sh, gfp)) {
shrink_buffers(sh);
- kmem_cache_free(conf->slab_cache, sh);
+ free_stripe(conf->slab_cache, sh);
return 0;
}
sh->hash_lock_index =
@@ -2172,7 +2300,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
* pages have been transferred over, and the old kmem_cache is
* freed when all stripes are done.
* 3/ reallocate conf->disks to be suitable bigger. If this fails,
- * we simple return a failre status - no need to clean anything up.
+ * we simply return a failure status - no need to clean anything up.
* 4/ allocate new pages for the new slots in the new stripe_heads.
* If this fails, we don't bother trying to shrink the
* stripe_heads down again, we just leave them as they are.
@@ -2185,17 +2313,12 @@ static int resize_stripes(struct r5conf *conf, int newsize)
struct stripe_head *osh, *nsh;
LIST_HEAD(newstripes);
struct disk_info *ndisks;
- int err;
+ int err = 0;
struct kmem_cache *sc;
int i;
int hash, cnt;
- if (newsize <= conf->pool_size)
- return 0; /* never bother to shrink */
-
- err = md_allow_write(conf->mddev);
- if (err)
- return err;
+ md_allow_write(conf->mddev);
/* Step 1 */
sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -2208,11 +2331,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
mutex_lock(&conf->cache_size_mutex);
for (i = conf->max_nr_stripes; i; i--) {
- nsh = alloc_stripe(sc, GFP_KERNEL, newsize);
+ nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
if (!nsh)
break;
- nsh->raid_conf = conf;
list_add(&nsh->lru, &newstripes);
}
if (i) {
@@ -2220,7 +2342,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
while (!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
list_del(&nsh->lru);
- kmem_cache_free(sc, nsh);
+ free_stripe(sc, nsh);
}
kmem_cache_destroy(sc);
mutex_unlock(&conf->cache_size_mutex);
@@ -2246,7 +2368,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
nsh->dev[i].orig_page = osh->dev[i].page;
}
nsh->hash_lock_index = hash;
- kmem_cache_free(conf->slab_cache, osh);
+ free_stripe(conf->slab_cache, osh);
cnt++;
if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
!!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
@@ -2285,6 +2407,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
err = -ENOMEM;
mutex_unlock(&conf->cache_size_mutex);
+
+ conf->slab_cache = sc;
+ conf->active_name = 1-conf->active_name;
+
/* Step 4, return new stripes to service */
while(!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2302,8 +2428,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
}
/* critical section pass, GFP_NOIO no longer needed */
- conf->slab_cache = sc;
- conf->active_name = 1-conf->active_name;
if (!err)
conf->pool_size = newsize;
return err;
@@ -2321,7 +2445,7 @@ static int drop_one_stripe(struct r5conf *conf)
return 0;
BUG_ON(atomic_read(&sh->count));
shrink_buffers(sh);
- kmem_cache_free(conf->slab_cache, sh);
+ free_stripe(conf->slab_cache, sh);
atomic_dec(&conf->active_stripes);
conf->max_nr_stripes--;
return 1;
@@ -2352,7 +2476,7 @@ static void raid5_end_read_request(struct bio * bi)
pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
- bi->bi_error);
+ bi->bi_status);
if (i == disks) {
bio_reset(bi);
BUG();
@@ -2372,7 +2496,7 @@ static void raid5_end_read_request(struct bio * bi)
s = sh->sector + rdev->new_data_offset;
else
s = sh->sector + rdev->data_offset;
- if (!bi->bi_error) {
+ if (!bi->bi_status) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
/* Note that this cannot happen on a
@@ -2489,7 +2613,7 @@ static void raid5_end_write_request(struct bio *bi)
}
pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
- bi->bi_error);
+ bi->bi_status);
if (i == disks) {
bio_reset(bi);
BUG();
@@ -2497,14 +2621,14 @@ static void raid5_end_write_request(struct bio *bi)
}
if (replacement) {
- if (bi->bi_error)
+ if (bi->bi_status)
md_error(conf->mddev, rdev);
else if (is_badblock(rdev, sh->sector,
STRIPE_SECTORS,
&first_bad, &bad_sectors))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
- if (bi->bi_error) {
+ if (bi->bi_status) {
set_bit(STRIPE_DEGRADED, &sh->state);
set_bit(WriteErrorSeen, &rdev->flags);
set_bit(R5_WriteError, &sh->dev[i].flags);
@@ -2525,7 +2649,7 @@ static void raid5_end_write_request(struct bio *bi)
}
rdev_dec_pending(rdev, conf->mddev);
- if (sh->batch_head && bi->bi_error && !replacement)
+ if (sh->batch_head && bi->bi_status && !replacement)
set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
bio_reset(bi);
@@ -2569,7 +2693,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
bdevname(rdev->bdev, b),
mdname(mddev),
conf->raid_disks - mddev->degraded);
- r5c_update_on_rdev_error(mddev);
+ r5c_update_on_rdev_error(mddev, rdev);
}
/*
@@ -2914,12 +3038,44 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
* like to flush data in journal to RAID disks first, so complex rmw
* is handled in the write path (handle_stripe_dirtying).
*
+ * 2. when journal space is critical (R5C_LOG_CRITICAL=1)
+ *
+ * It is important to be able to flush all stripes in raid5-cache.
+ * Therefore, we need to reserve some space on the journal device for
+ * these flushes. If the flush operation includes pending writes to the
+ * stripe, we need to reserve (conf->raid_disks + 1) pages per stripe
+ * for the flush out. If we exclude these pending writes from flush
+ * operation, we only need (conf->max_degraded + 1) pages per stripe.
+ * Therefore, excluding pending writes in these cases enables more
+ * efficient use of the journal device.
+ *
+ * Note: To make sure the stripe makes progress, we only delay
+ * towrite for stripes with data already in journal (injournal > 0).
+ * When LOG_CRITICAL, stripes with injournal == 0 will be sent to
+ * no_space_stripes list.
+ *
+ * 3. during journal failure
+ * In journal failure, we try to flush all cached data to raid disks
+ * based on data in stripe cache. The array is read-only to upper
+ * layers, so we would skip all pending writes.
+ *
*/
-static inline bool delay_towrite(struct r5dev *dev,
- struct stripe_head_state *s)
-{
- return !test_bit(R5_OVERWRITE, &dev->flags) &&
- !test_bit(R5_Insync, &dev->flags) && s->injournal;
+static inline bool delay_towrite(struct r5conf *conf,
+ struct r5dev *dev,
+ struct stripe_head_state *s)
+{
+ /* case 1 above */
+ if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+ !test_bit(R5_Insync, &dev->flags) && s->injournal)
+ return true;
+ /* case 2 above */
+ if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
+ s->injournal > 0)
+ return true;
+ /* case 3 above */
+ if (s->log_failed && s->injournal)
+ return true;
+ return false;
}
static void
@@ -2942,7 +3098,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (dev->towrite && !delay_towrite(dev, s)) {
+ if (dev->towrite && !delay_towrite(conf, dev, s)) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantdrain, &dev->flags);
if (!expand)
@@ -3020,6 +3176,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
+ if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
+ test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+ !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+ test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+ set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
__func__, (unsigned long long)sh->sector,
s->locked, s->ops_request);
@@ -3041,14 +3203,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
(unsigned long long)bi->bi_iter.bi_sector,
(unsigned long long)sh->sector);
- /*
- * If several bio share a stripe. The bio bi_phys_segments acts as a
- * reference count to avoid race. The reference count should already be
- * increased before this function is called (for example, in
- * raid5_make_request()), so other bio sharing this stripe will not free the
- * stripe. If a stripe is owned by one stripe, the stripe lock will
- * protect it.
- */
spin_lock_irq(&sh->stripe_lock);
/* Don't allow new IO added to stripes in batch list */
if (sh->batch_head)
@@ -3067,6 +3221,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
goto overlap;
+ if (forwrite && raid5_has_ppl(conf)) {
+ /*
+ * With PPL only writes to consecutive data chunks within a
+ * stripe are allowed because for a single stripe_head we can
+ * only have one PPL entry at a time, which describes one data
+ * range. Not really an overlap, but wait_for_overlap can be
+ * used to handle this.
+ */
+ sector_t sector;
+ sector_t first = 0;
+ sector_t last = 0;
+ int count = 0;
+ int i;
+
+ for (i = 0; i < sh->disks; i++) {
+ if (i != sh->pd_idx &&
+ (i == dd_idx || sh->dev[i].towrite)) {
+ sector = sh->dev[i].sector;
+ if (count == 0 || sector < first)
+ first = sector;
+ if (sector > last)
+ last = sector;
+ count++;
+ }
+ }
+
+ if (first + conf->chunk_sectors * (count - 1) != last)
+ goto overlap;
+ }
+
if (!forwrite || previous)
clear_bit(STRIPE_BATCH_READY, &sh->state);
@@ -3074,7 +3258,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
if (*bip)
bi->bi_next = *bip;
*bip = bi;
- raid5_inc_bi_active_stripes(bi);
+ bio_inc_remaining(bi);
+ md_write_inc(conf->mddev, bi);
if (forwrite) {
/* check if page is covered */
@@ -3151,8 +3336,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
static void
handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s, int disks,
- struct bio_list *return_bi)
+ struct stripe_head_state *s, int disks)
{
int i;
BUG_ON(sh->batch_head);
@@ -3188,7 +3372,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi)
bitmap_end = 1;
- r5l_stripe_write_finished(sh);
+ log_stripe_write_finished(sh);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
@@ -3197,11 +3381,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
- bi->bi_error = -EIO;
- if (!raid5_dec_bi_active_stripes(bi)) {
- md_write_end(conf->mddev);
- bio_list_add(return_bi, bi);
- }
+ bi->bi_status = BLK_STS_IOERR;
+ md_write_end(conf->mddev);
+ bio_endio(bi);
bi = nextbi;
}
if (bitmap_end)
@@ -3221,11 +3403,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
- bi->bi_error = -EIO;
- if (!raid5_dec_bi_active_stripes(bi)) {
- md_write_end(conf->mddev);
- bio_list_add(return_bi, bi);
- }
+ bi->bi_status = BLK_STS_IOERR;
+ md_write_end(conf->mddev);
+ bio_endio(bi);
bi = bi2;
}
@@ -3249,9 +3429,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct bio *nextbi =
r5_next_bio(bi, sh->dev[i].sector);
- bi->bi_error = -EIO;
- if (!raid5_dec_bi_active_stripes(bi))
- bio_list_add(return_bi, bi);
+ bi->bi_status = BLK_STS_IOERR;
+ bio_endio(bi);
bi = nextbi;
}
}
@@ -3387,7 +3566,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
/* Pre-reads are not permitted until after a short delay
* to gather multiple requests. However if this
- * device is no Insync, the block could only be be computed
+ * device is not Insync, the block could only be computed
* and there is no need to delay that.
*/
return 0;
@@ -3406,7 +3585,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
/* If we are forced to do a reconstruct-write, either because
* the current RAID6 implementation only supports that, or
- * or because parity cannot be trusted and we are currently
+ * because parity cannot be trusted and we are currently
* recovering it, there is extra need to be careful.
* If one of the devices that we would need to read, because
* it is not being overwritten (and maybe not written at all)
@@ -3446,9 +3625,20 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
BUG_ON(test_bit(R5_Wantread, &dev->flags));
BUG_ON(sh->batch_head);
+
+ /*
+ * In the raid6 case if the only non-uptodate disk is P
+ * then we already trusted P to compute the other failed
+ * drives. It is safe to compute rather than re-read P.
+ * In other cases we only compute blocks from failed
+ * devices, otherwise check/repair might fail to detect
+ * a real inconsistency.
+ */
+
if ((s->uptodate == disks - 1) &&
+ ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
(s->failed && (disk_idx == s->failed_num[0] ||
- disk_idx == s->failed_num[1]))) {
+ disk_idx == s->failed_num[1])))) {
/* have disk failed, and we're requested to fetch it;
* do compute it
*/
@@ -3550,7 +3740,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
* never LOCKED, so we don't need to test 'failed' directly.
*/
static void handle_stripe_clean_event(struct r5conf *conf,
- struct stripe_head *sh, int disks, struct bio_list *return_bi)
+ struct stripe_head *sh, int disks)
{
int i;
struct r5dev *dev;
@@ -3582,10 +3772,8 @@ returnbi:
while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
- if (!raid5_dec_bi_active_stripes(wbi)) {
- md_write_end(conf->mddev);
- bio_list_add(return_bi, wbi);
- }
+ md_write_end(conf->mddev);
+ bio_endio(wbi);
wbi = wbi2;
}
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -3607,7 +3795,7 @@ returnbi:
discard_pending = 1;
}
- r5l_stripe_write_finished(sh);
+ log_stripe_write_finished(sh);
if (!discard_pending &&
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
@@ -3694,7 +3882,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
} else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */
struct r5dev *dev = &sh->dev[i];
- if (((dev->towrite && !delay_towrite(dev, s)) ||
+ if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
i == sh->pd_idx || i == sh->qd_idx ||
test_bit(R5_InJournal, &dev->flags)) &&
!test_bit(R5_LOCKED, &dev->flags) &&
@@ -3718,8 +3906,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
}
}
- pr_debug("for sector %llu, rmw=%d rcw=%d\n",
- (unsigned long long)sh->sector, rmw, rcw);
+ pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
+ (unsigned long long)sh->sector, sh->state, rmw, rcw);
set_bit(STRIPE_HANDLE, &sh->state);
if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
@@ -3759,7 +3947,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (((dev->towrite && !delay_towrite(dev, s)) ||
+ if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
i == sh->pd_idx || i == sh->qd_idx ||
test_bit(R5_InJournal, &dev->flags)) &&
!test_bit(R5_LOCKED, &dev->flags) &&
@@ -3897,10 +4085,15 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
set_bit(STRIPE_INSYNC, &sh->state);
else {
atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
- if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+ if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
- else {
+ pr_warn_ratelimited("%s: mismatch sector in range "
+ "%llu-%llu\n", mdname(conf->mddev),
+ (unsigned long long) sh->sector,
+ (unsigned long long) sh->sector +
+ STRIPE_SECTORS);
+ } else {
sh->check_state = check_state_compute_run;
set_bit(STRIPE_COMPUTE_RUN, &sh->state);
set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
@@ -4049,10 +4242,15 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
}
} else {
atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
- if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+ if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
- else {
+ pr_warn_ratelimited("%s: mismatch sector in range "
+ "%llu-%llu\n", mdname(conf->mddev),
+ (unsigned long long) sh->sector,
+ (unsigned long long) sh->sector +
+ STRIPE_SECTORS);
+ } else {
int *target = &sh->ops.target;
sh->ops.target = -1;
@@ -4472,8 +4670,13 @@ static void handle_stripe(struct stripe_head *sh)
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
spin_lock(&sh->stripe_lock);
- /* Cannot process 'sync' concurrently with 'discard' */
- if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+ /*
+ * Cannot process 'sync' concurrently with 'discard'.
+ * Flush data in r5cache before 'sync'.
+ */
+ if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+ !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
+ !test_bit(STRIPE_DISCARD, &sh->state) &&
test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state);
@@ -4494,7 +4697,8 @@ static void handle_stripe(struct stripe_head *sh)
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
goto finish;
- if (s.handle_bad_blocks) {
+ if (s.handle_bad_blocks ||
+ test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
}
@@ -4519,15 +4723,20 @@ static void handle_stripe(struct stripe_head *sh)
" to_write=%d failed=%d failed_num=%d,%d\n",
s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
s.failed_num[0], s.failed_num[1]);
- /* check if the array has lost more than max_degraded devices and,
+ /*
+ * check if the array has lost more than max_degraded devices and,
* if so, some requests might need to be failed.
+ *
+ * When the journal device has failed (log_failed), we only process
+ * the stripe if it still has data that needs writing to the raid disks
*/
- if (s.failed > conf->max_degraded || s.log_failed) {
+ if (s.failed > conf->max_degraded ||
+ (s.log_failed && s.injournal == 0)) {
sh->check_state = 0;
sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0);
if (s.to_read+s.to_write+s.written)
- handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+ handle_failed_stripe(conf, sh, &s, disks);
if (s.syncing + s.replacing)
handle_failed_sync(conf, sh, &s);
}
@@ -4593,11 +4802,11 @@ static void handle_stripe(struct stripe_head *sh)
&& !test_bit(R5_LOCKED, &qdev->flags)
&& (test_bit(R5_UPTODATE, &qdev->flags) ||
test_bit(R5_Discard, &qdev->flags))))))
- handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+ handle_stripe_clean_event(conf, sh, disks);
if (s.just_cached)
- r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
- r5l_stripe_write_finished(sh);
+ r5c_handle_cached_data_endio(conf, sh, disks);
+ log_stripe_write_finished(sh);
/* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests
@@ -4824,16 +5033,6 @@ finish:
md_wakeup_thread(conf->mddev->thread);
}
- if (!bio_list_empty(&s.return_bi)) {
- if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
- spin_lock_irq(&conf->device_lock);
- bio_list_merge(&conf->return_bi, &s.return_bi);
- spin_unlock_irq(&conf->device_lock);
- md_wakeup_thread(conf->mddev->thread);
- } else
- return_io(&s.return_bi);
- }
-
clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
}
@@ -4922,12 +5121,14 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
md_wakeup_thread(conf->mddev->thread);
}
-static struct bio *remove_bio_from_retry(struct r5conf *conf)
+static struct bio *remove_bio_from_retry(struct r5conf *conf,
+ unsigned int *offset)
{
struct bio *bi;
bi = conf->retry_read_aligned;
if (bi) {
+ *offset = conf->retry_read_offset;
conf->retry_read_aligned = NULL;
return bi;
}
@@ -4935,11 +5136,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
if(bi) {
conf->retry_read_aligned_list = bi->bi_next;
bi->bi_next = NULL;
- /*
- * this sets the active strip count to 1 and the processed
- * strip count to zero (upper 8 bits)
- */
- raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
+ *offset = 0;
}
return bi;
@@ -4957,7 +5154,7 @@ static void raid5_align_endio(struct bio *bi)
struct mddev *mddev;
struct r5conf *conf;
struct md_rdev *rdev;
- int error = bi->bi_error;
+ blk_status_t error = bi->bi_status;
bio_put(bi);
@@ -4969,8 +5166,6 @@ static void raid5_align_endio(struct bio *bi)
rdev_dec_pending(rdev, conf->mddev);
if (!error) {
- trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
- raid_bi, 0);
bio_endio(raid_bi);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
@@ -4995,9 +5190,9 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
return 0;
}
/*
- * use bio_clone_mddev to make a copy of the bio
+ * use bio_clone_fast to make a copy of the bio
*/
- align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
+ align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set);
if (!align_bi)
return 0;
/*
@@ -5025,6 +5220,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
rdev->recovery_offset >= end_sector)))
rdev = NULL;
}
+
+ if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
+ rcu_read_unlock();
+ bio_put(align_bi);
+ return 0;
+ }
+
if (rdev) {
sector_t first_bad;
int bad_sectors;
@@ -5069,24 +5271,20 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
{
struct bio *split;
+ sector_t sector = raid_bio->bi_iter.bi_sector;
+ unsigned chunk_sects = mddev->chunk_sectors;
+ unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
- do {
- sector_t sector = raid_bio->bi_iter.bi_sector;
- unsigned chunk_sects = mddev->chunk_sectors;
- unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
-
- if (sectors < bio_sectors(raid_bio)) {
- split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
- bio_chain(split, raid_bio);
- } else
- split = raid_bio;
+ if (sectors < bio_sectors(raid_bio)) {
+ struct r5conf *conf = mddev->private;
+ split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split);
+ bio_chain(split, raid_bio);
+ generic_make_request(raid_bio);
+ raid_bio = split;
+ }
- if (!raid5_read_one_chunk(mddev, split)) {
- if (split != raid_bio)
- generic_make_request(raid_bio);
- return split;
- }
- } while (split != raid_bio);
+ if (!raid5_read_one_chunk(mddev, raid_bio))
+ return raid_bio;
return NULL;
}
@@ -5103,19 +5301,29 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
*/
static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
{
- struct stripe_head *sh = NULL, *tmp;
+ struct stripe_head *sh, *tmp;
struct list_head *handle_list = NULL;
- struct r5worker_group *wg = NULL;
+ struct r5worker_group *wg;
+ bool second_try = !r5c_is_writeback(conf->log) &&
+ !r5l_log_disk_error(conf);
+ bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
+ r5l_log_disk_error(conf);
+again:
+ wg = NULL;
+ sh = NULL;
if (conf->worker_cnt_per_group == 0) {
- handle_list = &conf->handle_list;
+ handle_list = try_loprio ? &conf->loprio_list :
+ &conf->handle_list;
} else if (group != ANY_GROUP) {
- handle_list = &conf->worker_groups[group].handle_list;
+ handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
+ &conf->worker_groups[group].handle_list;
wg = &conf->worker_groups[group];
} else {
int i;
for (i = 0; i < conf->group_cnt; i++) {
- handle_list = &conf->worker_groups[i].handle_list;
+ handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
+ &conf->worker_groups[i].handle_list;
wg = &conf->worker_groups[i];
if (!list_empty(handle_list))
break;
@@ -5166,8 +5374,13 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
wg = NULL;
}
- if (!sh)
- return NULL;
+ if (!sh) {
+ if (second_try)
+ return NULL;
+ second_try = true;
+ try_loprio = !try_loprio;
+ goto again;
+ }
if (wg) {
wg->stripes_cnt--;
@@ -5256,7 +5469,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
struct r5conf *conf = mddev->private;
sector_t logical_sector, last_sector;
struct stripe_head *sh;
- int remaining;
int stripe_sectors;
if (mddev->reshape_position != MaxSector)
@@ -5267,7 +5479,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
bi->bi_next = NULL;
- bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
stripe_sectors = conf->chunk_sectors *
(conf->raid_disks - conf->max_degraded);
@@ -5313,7 +5524,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
continue;
sh->dev[d].towrite = bi;
set_bit(R5_OVERWRITE, &sh->dev[d].flags);
- raid5_inc_bi_active_stripes(bi);
+ bio_inc_remaining(bi);
+ md_write_inc(mddev, bi);
sh->overwrite_disks++;
}
spin_unlock_irq(&sh->stripe_lock);
@@ -5336,14 +5548,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
release_stripe_plug(mddev, sh);
}
- remaining = raid5_dec_bi_active_stripes(bi);
- if (remaining == 0) {
- md_write_end(mddev);
- bio_endio(bi);
- }
+ bio_endio(bi);
}
-static void raid5_make_request(struct mddev *mddev, struct bio * bi)
+static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
{
struct r5conf *conf = mddev->private;
int dd_idx;
@@ -5351,7 +5559,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
sector_t logical_sector, last_sector;
struct stripe_head *sh;
const int rw = bio_data_dir(bi);
- int remaining;
DEFINE_WAIT(w);
bool do_prepare;
bool do_flush = false;
@@ -5360,10 +5567,10 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
int ret = r5l_handle_flush_request(conf->log, bi);
if (ret == 0)
- return;
+ return true;
if (ret == -ENODEV) {
md_flush_request(mddev, bi);
- return;
+ return true;
}
/* ret == -EAGAIN, fallback */
/*
@@ -5373,30 +5580,29 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
do_flush = bi->bi_opf & REQ_PREFLUSH;
}
- md_write_start(mddev, bi);
-
+ if (!md_write_start(mddev, bi))
+ return false;
/*
* If array is degraded, better not do chunk aligned read because
* later we might have to read it again in order to reconstruct
* data on failed drives.
*/
if (rw == READ && mddev->degraded == 0 &&
- !r5c_is_writeback(conf->log) &&
mddev->reshape_position == MaxSector) {
bi = chunk_aligned_read(mddev, bi);
if (!bi)
- return;
+ return true;
}
if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
make_discard_request(mddev, bi);
- return;
+ md_write_end(mddev);
+ return true;
}
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
- bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
@@ -5487,12 +5693,15 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
* userspace, we want an interruptible
* wait.
*/
- flush_signals(current);
prepare_to_wait(&conf->wait_for_overlap,
&w, TASK_INTERRUPTIBLE);
if (logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) {
+ sigset_t full, old;
+ sigfillset(&full);
+ sigprocmask(SIG_BLOCK, &full, &old);
schedule();
+ sigprocmask(SIG_SETMASK, &old, NULL);
do_prepare = true;
}
goto retry;
@@ -5525,22 +5734,16 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
release_stripe_plug(mddev, sh);
} else {
/* cannot get stripe for read-ahead, just give-up */
- bi->bi_error = -EIO;
+ bi->bi_status = BLK_STS_IOERR;
break;
}
}
finish_wait(&conf->wait_for_overlap, &w);
- remaining = raid5_dec_bi_active_stripes(bi);
- if (remaining == 0) {
-
- if ( rw == WRITE )
- md_write_end(mddev);
-
- trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
- bi, 0);
- bio_endio(bi);
- }
+ if (rw == WRITE)
+ md_write_end(mddev);
+ bio_endio(bi);
+ return true;
}
static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
@@ -5889,7 +6092,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
return STRIPE_SECTORS;
}
-static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
+static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
+ unsigned int offset)
{
/* We may not be able to submit a whole bio at once as there
* may not be enough stripe_heads available.
@@ -5905,7 +6109,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
int dd_idx;
sector_t sector, logical_sector, last_sector;
int scnt = 0;
- int remaining;
int handled = 0;
logical_sector = raid_bio->bi_iter.bi_sector &
@@ -5919,7 +6122,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
sector += STRIPE_SECTORS,
scnt++) {
- if (scnt < raid5_bi_processed_stripes(raid_bio))
+ if (scnt < offset)
/* already done this stripe */
continue;
@@ -5927,15 +6130,15 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
if (!sh) {
/* failed to get a stripe - must wait */
- raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
+ conf->retry_read_offset = scnt;
return handled;
}
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
raid5_release_stripe(sh);
- raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
+ conf->retry_read_offset = scnt;
return handled;
}
@@ -5944,12 +6147,9 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
raid5_release_stripe(sh);
handled++;
}
- remaining = raid5_dec_bi_active_stripes(raid_bio);
- if (remaining == 0) {
- trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
- raid_bio, 0);
- bio_endio(raid_bio);
- }
+
+ bio_endio(raid_bio);
+
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
return handled;
@@ -5992,7 +6192,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < batch_size; i++)
handle_stripe(batch[i]);
- r5l_write_stripe_run(conf->log);
+ log_write_stripe_run(conf);
cond_resched();
@@ -6009,6 +6209,7 @@ static void raid5_do_work(struct work_struct *work)
struct r5worker *worker = container_of(work, struct r5worker, work);
struct r5worker_group *group = worker->group;
struct r5conf *conf = group->conf;
+ struct mddev *mddev = conf->mddev;
int group_id = group - conf->worker_groups;
int handled;
struct blk_plug plug;
@@ -6029,6 +6230,9 @@ static void raid5_do_work(struct work_struct *work)
if (!batch_size && !released)
break;
handled += batch_size;
+ wait_event_lock_irq(mddev->sb_wait,
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+ conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);
@@ -6056,24 +6260,13 @@ static void raid5d(struct md_thread *thread)
md_check_recovery(mddev);
- if (!bio_list_empty(&conf->return_bi) &&
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
- struct bio_list tmp = BIO_EMPTY_LIST;
- spin_lock_irq(&conf->device_lock);
- if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
- bio_list_merge(&tmp, &conf->return_bi);
- bio_list_init(&conf->return_bi);
- }
- spin_unlock_irq(&conf->device_lock);
- return_io(&tmp);
- }
-
blk_start_plug(&plug);
handled = 0;
spin_lock_irq(&conf->device_lock);
while (1) {
struct bio *bio;
int batch_size, released;
+ unsigned int offset;
released = release_stripe_list(conf, conf->temp_inactive_list);
if (released)
@@ -6091,10 +6284,10 @@ static void raid5d(struct md_thread *thread)
}
raid5_activate_delayed(conf);
- while ((bio = remove_bio_from_retry(conf))) {
+ while ((bio = remove_bio_from_retry(conf, &offset))) {
int ok;
spin_unlock_irq(&conf->device_lock);
- ok = retry_aligned_read(conf, bio);
+ ok = retry_aligned_read(conf, bio, offset);
spin_lock_irq(&conf->device_lock);
if (!ok)
break;
@@ -6126,6 +6319,8 @@ static void raid5d(struct md_thread *thread)
mutex_unlock(&conf->cache_size_mutex);
}
+ flush_deferred_bios(conf);
+
r5l_flush_stripe_to_raid(conf->log);
async_tx_issue_pending_all();
@@ -6151,7 +6346,6 @@ int
raid5_set_cache_size(struct mddev *mddev, int size)
{
struct r5conf *conf = mddev->private;
- int err;
if (size <= 16 || size > 32768)
return -EINVAL;
@@ -6163,10 +6357,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
;
mutex_unlock(&conf->cache_size_mutex);
-
- err = md_allow_write(mddev);
- if (err)
- return err;
+ md_allow_write(mddev);
mutex_lock(&conf->cache_size_mutex);
while (size > conf->max_nr_stripes)
@@ -6331,10 +6522,10 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
mddev_suspend(mddev);
conf->skip_copy = new;
if (new)
- mddev->queue->backing_dev_info.capabilities |=
+ mddev->queue->backing_dev_info->capabilities |=
BDI_CAP_STABLE_WRITES;
else
- mddev->queue->backing_dev_info.capabilities &=
+ mddev->queue->backing_dev_info->capabilities &=
~BDI_CAP_STABLE_WRITES;
mddev_resume(mddev);
}
@@ -6476,6 +6667,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
group = &(*worker_groups)[i];
INIT_LIST_HEAD(&group->handle_list);
+ INIT_LIST_HEAD(&group->loprio_list);
group->conf = conf;
group->workers = workers + i * cnt;
@@ -6566,8 +6758,8 @@ static void free_conf(struct r5conf *conf)
{
int i;
- if (conf->log)
- r5l_exit_log(conf->log);
+ log_exit(conf);
+
if (conf->shrinker.nr_deferred)
unregister_shrinker(&conf->shrinker);
@@ -6578,7 +6770,10 @@ static void free_conf(struct r5conf *conf)
if (conf->disks[i].extra_page)
put_page(conf->disks[i].extra_page);
kfree(conf->disks);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf->stripe_hashtbl);
+ kfree(conf->pending_data);
kfree(conf);
}
@@ -6688,6 +6883,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
if (conf == NULL)
goto abort;
+ INIT_LIST_HEAD(&conf->free_list);
+ INIT_LIST_HEAD(&conf->pending_list);
+ conf->pending_data = kzalloc(sizeof(struct r5pending_data) *
+ PENDING_IO_MAX, GFP_KERNEL);
+ if (!conf->pending_data)
+ goto abort;
+ for (i = 0; i < PENDING_IO_MAX; i++)
+ list_add(&conf->pending_data[i].sibling, &conf->free_list);
/* Don't enable multi-threading by default*/
if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
&new_group)) {
@@ -6703,14 +6906,25 @@ static struct r5conf *setup_conf(struct mddev *mddev)
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
+ INIT_LIST_HEAD(&conf->loprio_list);
INIT_LIST_HEAD(&conf->hold_list);
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list);
- bio_list_init(&conf->return_bi);
init_llist_head(&conf->released_stripes);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
atomic_set(&conf->active_aligned_reads, 0);
+ spin_lock_init(&conf->pending_bios_lock);
+ conf->batch_bio_dispatch = true;
+ rdev_for_each(rdev, mddev) {
+ if (test_bit(Journal, &rdev->flags))
+ continue;
+ if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
+ conf->batch_bio_dispatch = false;
+ break;
+ }
+ }
+
conf->bypass_threshold = BYPASS_THRESHOLD;
conf->recovery_disabled = mddev->recovery_disabled - 1;
@@ -6733,6 +6947,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
}
+ conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
+ if (!conf->bio_split)
+ goto abort;
conf->mddev = mddev;
if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -6757,6 +6974,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
atomic_set(&conf->r5c_cached_partial_stripes, 0);
INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
+ atomic_set(&conf->r5c_flushing_full_stripes, 0);
+ atomic_set(&conf->r5c_flushing_partial_stripes, 0);
conf->level = mddev->new_level;
conf->chunk_sectors = mddev->new_chunk_sectors;
@@ -6903,6 +7122,9 @@ static int raid5_run(struct mddev *mddev)
long long min_offset_diff = 0;
int first = 1;
+ if (mddev_init_writes_pending(mddev) < 0)
+ return -ENOMEM;
+
if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
mdname(mddev));
@@ -7015,6 +7237,13 @@ static int raid5_run(struct mddev *mddev)
BUG_ON(mddev->delta_disks != 0);
}
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+ test_bit(MD_HAS_PPL, &mddev->flags)) {
+ pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
+ mdname(mddev));
+ clear_bit(MD_HAS_PPL, &mddev->flags);
+ }
+
if (mddev->private == NULL)
conf = setup_conf(mddev);
else
@@ -7106,7 +7335,10 @@ static int raid5_run(struct mddev *mddev)
if (mddev->degraded > dirty_parity_disks &&
mddev->recovery_cp != MaxSector) {
- if (mddev->ok_start_degraded)
+ if (test_bit(MD_HAS_PPL, &mddev->flags))
+ pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
+ mdname(mddev));
+ else if (mddev->ok_start_degraded)
pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
mdname(mddev));
else {
@@ -7145,7 +7377,6 @@ static int raid5_run(struct mddev *mddev)
if (mddev->queue) {
int chunk_size;
- bool discard_supported = true;
/* read-ahead size must cover two whole stripes, which
* is 2 * (datadisks) * chunksize where 'n' is the
* number of raid devices
@@ -7153,8 +7384,8 @@ static int raid5_run(struct mddev *mddev)
int data_disks = conf->previous_raid_disks - conf->max_degraded;
int stripe = data_disks *
((mddev->chunk_sectors << 9) / PAGE_SIZE);
- if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
- mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+ if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
+ mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
@@ -7173,56 +7404,32 @@ static int raid5_run(struct mddev *mddev)
mddev->queue->limits.discard_alignment = stripe;
mddev->queue->limits.discard_granularity = stripe;
- /*
- * We use 16-bit counter of active stripes in bi_phys_segments
- * (minus one for over-loaded initialization)
- */
- blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
- blk_queue_max_discard_sectors(mddev->queue,
- 0xfffe * STRIPE_SECTORS);
-
- /*
- * unaligned part of discard request will be ignored, so can't
- * guarantee discard_zeroes_data
- */
- mddev->queue->limits.discard_zeroes_data = 0;
-
blk_queue_max_write_same_sectors(mddev->queue, 0);
+ blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
- /*
- * discard_zeroes_data is required, otherwise data
- * could be lost. Consider a scenario: discard a stripe
- * (the stripe could be inconsistent if
- * discard_zeroes_data is 0); write one disk of the
- * stripe (the stripe could be inconsistent again
- * depending on which disks are used to calculate
- * parity); the disk is broken; The stripe data of this
- * disk is lost.
- */
- if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
- !bdev_get_queue(rdev->bdev)->
- limits.discard_zeroes_data)
- discard_supported = false;
- /* Unfortunately, discard_zeroes_data is not currently
- * a guarantee - just a hint. So we only allow DISCARD
- * if the sysadmin has confirmed that only safe devices
- * are in use by setting a module parameter.
- */
- if (!devices_handle_discard_safely) {
- if (discard_supported) {
- pr_info("md/raid456: discard support disabled due to uncertainty.\n");
- pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
- }
- discard_supported = false;
- }
}
- if (discard_supported &&
+ /*
+ * zeroing is required, otherwise data
+ * could be lost. Consider a scenario: discard a stripe
+ * (the stripe could be inconsistent if the member devices
+ * do not zero discarded blocks); write one disk of the
+ * stripe (the stripe could be inconsistent again
+ * depending on which disks are used to calculate
+ * parity); the disk is broken; The stripe data of this
+ * disk is lost.
+ *
+ * We only allow DISCARD if the sysadmin has confirmed that
+ * only safe devices are in use by setting a module parameter.
+ * A better idea might be to turn DISCARD into WRITE_ZEROES
+ * requests, as that is required to be safe.
+ */
+ if (devices_handle_discard_safely &&
mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
mddev->queue->limits.discard_granularity >= stripe)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
@@ -7234,14 +7441,8 @@ static int raid5_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
}
- if (journal_dev) {
- char b[BDEVNAME_SIZE];
-
- pr_debug("md/raid:%s: using device %s as journal\n",
- mdname(mddev), bdevname(journal_dev->bdev, b));
- if (r5l_init_log(conf, journal_dev))
- goto abort;
- }
+ if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
+ goto abort;
return 0;
abort:
@@ -7355,17 +7556,18 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
print_raid5_conf(conf);
if (test_bit(Journal, &rdev->flags) && conf->log) {
- struct r5l_log *log;
/*
* we can't wait pending write here, as this is called in
* raid5d, wait will deadlock.
+ * neilb: nothing here locks out new writes,
+ * so this cannot be safe.
*/
- if (atomic_read(&mddev->writes_pending))
+ if (atomic_read(&conf->active_stripes) ||
+ atomic_read(&conf->r5c_cached_full_stripes) ||
+ atomic_read(&conf->r5c_cached_partial_stripes)) {
return -EBUSY;
- log = conf->log;
- conf->log = NULL;
- synchronize_rcu();
- r5l_exit_log(log);
+ }
+ log_exit(conf);
return 0;
}
if (rdev == p->rdev)
@@ -7404,6 +7606,11 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
*rdevp = rdev;
}
}
+ if (!err) {
+ err = log_modify(conf, rdev, false);
+ if (err)
+ goto abort;
+ }
if (p->replacement) {
/* We must have just cleared 'rdev' */
p->rdev = p->replacement;
@@ -7412,12 +7619,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
* but will never see neither - if they are careful
*/
p->replacement = NULL;
- clear_bit(WantReplacement, &rdev->flags);
- } else
- /* We might have just removed the Replacement as faulty-
- * clear the bit just in case
- */
- clear_bit(WantReplacement, &rdev->flags);
+
+ if (!err)
+ err = log_modify(conf, p->rdev, true);
+ }
+
+ clear_bit(WantReplacement, &rdev->flags);
abort:
print_raid5_conf(conf);
@@ -7434,7 +7641,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int last = conf->raid_disks - 1;
if (test_bit(Journal, &rdev->flags)) {
- char b[BDEVNAME_SIZE];
if (conf->log)
return -EBUSY;
@@ -7443,9 +7649,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
* The array is in readonly mode if journal is missing, so no
* write requests running. We should be safe
*/
- r5l_init_log(conf, rdev);
- pr_debug("md/raid:%s: using device %s as journal\n",
- mdname(mddev), bdevname(rdev->bdev, b));
+ log_init(conf, rdev, false);
return 0;
}
if (mddev->recovery_disabled == conf->recovery_disabled)
@@ -7472,10 +7676,12 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (p->rdev == NULL) {
clear_bit(In_sync, &rdev->flags);
rdev->raid_disk = disk;
- err = 0;
if (rdev->saved_raid_disk != disk)
conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev);
+
+ err = log_modify(conf, rdev, true);
+
goto out;
}
}
@@ -7509,7 +7715,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
sector_t newsize;
struct r5conf *conf = mddev->private;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return -EINVAL;
sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
@@ -7522,8 +7728,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
return ret;
}
md_set_array_sectors(mddev, newsize);
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) {
mddev->recovery_cp = mddev->dev_sectors;
@@ -7562,7 +7766,7 @@ static int check_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return -EINVAL;
if (mddev->delta_disks == 0 &&
mddev->new_layout == mddev->layout &&
@@ -7595,6 +7799,9 @@ static int check_reshape(struct mddev *mddev)
mddev->chunk_sectors)
) < 0)
return -ENOMEM;
+
+ if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
+ return 0; /* never bother to shrink */
return resize_stripes(conf, (conf->previous_raid_disks
+ mddev->delta_disks));
}
@@ -7744,12 +7951,10 @@ static void end_reshape(struct r5conf *conf)
{
if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
- struct md_rdev *rdev;
spin_lock_irq(&conf->device_lock);
conf->previous_raid_disks = conf->raid_disks;
- rdev_for_each(rdev, conf->mddev)
- rdev->data_offset = rdev->new_data_offset;
+ md_finish_reshape(conf->mddev);
smp_wmb();
conf->reshape_progress = MaxSector;
conf->mddev->reshape_position = MaxSector;
@@ -7763,8 +7968,8 @@ static void end_reshape(struct r5conf *conf)
int data_disks = conf->raid_disks - conf->max_degraded;
int stripe = data_disks * ((conf->chunk_sectors << 9)
/ PAGE_SIZE);
- if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
- conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+ if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
+ conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
}
}
}
@@ -8085,6 +8290,68 @@ static void *raid6_takeover(struct mddev *mddev)
return setup_conf(mddev);
}
+static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
+{
+ struct r5conf *conf;
+ int err;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf) {
+ mddev_unlock(mddev);
+ return -ENODEV;
+ }
+
+ if (strncmp(buf, "ppl", 3) == 0) {
+ /* ppl only works with RAID 5 */
+ if (!raid5_has_ppl(conf) && conf->level == 5) {
+ err = log_init(conf, NULL, true);
+ if (!err) {
+ err = resize_stripes(conf, conf->pool_size);
+ if (err)
+ log_exit(conf);
+ }
+ } else
+ err = -EINVAL;
+ } else if (strncmp(buf, "resync", 6) == 0) {
+ if (raid5_has_ppl(conf)) {
+ mddev_suspend(mddev);
+ log_exit(conf);
+ mddev_resume(mddev);
+ err = resize_stripes(conf, conf->pool_size);
+ } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
+ r5l_log_disk_error(conf)) {
+ bool journal_dev_exists = false;
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev)
+ if (test_bit(Journal, &rdev->flags)) {
+ journal_dev_exists = true;
+ break;
+ }
+
+ if (!journal_dev_exists) {
+ mddev_suspend(mddev);
+ clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+ mddev_resume(mddev);
+ } else /* the journal device must be removed first */
+ err = -EBUSY;
+ } else
+ err = -EINVAL;
+ } else {
+ err = -EINVAL;
+ }
+
+ if (!err)
+ md_update_sb(mddev, 1);
+
+ mddev_unlock(mddev);
+
+ return err;
+}
+
static struct md_personality raid6_personality =
{
.name = "raid6",
@@ -8107,6 +8374,7 @@ static struct md_personality raid6_personality =
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.congested = raid5_congested,
+ .change_consistency_policy = raid5_change_consistency_policy,
};
static struct md_personality raid5_personality =
{
@@ -8130,6 +8398,7 @@ static struct md_personality raid5_personality =
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.congested = raid5_congested,
+ .change_consistency_policy = raid5_change_consistency_policy,
};
static struct md_personality raid4_personality =
@@ -8154,6 +8423,7 @@ static struct md_personality raid4_personality =
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.congested = raid5_congested,
+ .change_consistency_policy = raid5_change_consistency_policy,
};
static int __init raid5_init(void)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 1440fa26e296..f6536399677a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -224,10 +224,16 @@ struct stripe_head {
spinlock_t batch_lock; /* only header's lock is useful */
struct list_head batch_list; /* protected by head's batch lock*/
- struct r5l_io_unit *log_io;
+ union {
+ struct r5l_io_unit *log_io;
+ struct ppl_io_unit *ppl_io;
+ };
+
struct list_head log_list;
sector_t log_start; /* first meta block on the journal */
struct list_head r5c; /* for r5c_cache->stripe_in_journal */
+
+ struct page *ppl_page; /* partial parity of this stripe */
/**
* struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
@@ -272,7 +278,6 @@ struct stripe_head_state {
int dec_preread_active;
unsigned long ops_request;
- struct bio_list return_bi;
struct md_rdev *blocked_rdev;
int handle_bad_blocks;
int log_failed;
@@ -400,6 +405,7 @@ enum {
STRIPE_OP_BIODRAIN,
STRIPE_OP_RECONSTRUCT,
STRIPE_OP_CHECK,
+ STRIPE_OP_PARTIAL_PARITY,
};
/*
@@ -481,50 +487,6 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
return NULL;
}
-/*
- * We maintain a biased count of active stripes in the bottom 16 bits of
- * bi_phys_segments, and a count of processed stripes in the upper 16 bits
- */
-static inline int raid5_bi_processed_stripes(struct bio *bio)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
- return (atomic_read(segments) >> 16) & 0xffff;
-}
-
-static inline int raid5_dec_bi_active_stripes(struct bio *bio)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
- return atomic_sub_return(1, segments) & 0xffff;
-}
-
-static inline void raid5_inc_bi_active_stripes(struct bio *bio)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
- atomic_inc(segments);
-}
-
-static inline void raid5_set_bi_processed_stripes(struct bio *bio,
- unsigned int cnt)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
- int old, new;
-
- do {
- old = atomic_read(segments);
- new = (old & 0xffff) | (cnt << 16);
- } while (atomic_cmpxchg(segments, old, new) != old);
-}
-
-static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
- atomic_set(segments, cnt);
-}
-
/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
* This is because we sometimes take all the spinlocks
* and creating that much locking depth can cause
@@ -542,11 +504,22 @@ struct r5worker {
struct r5worker_group {
struct list_head handle_list;
+ struct list_head loprio_list;
struct r5conf *conf;
struct r5worker *workers;
int stripes_cnt;
};
+/*
+ * r5c journal modes of the array: write-back or write-through.
+ * Write-through mode behaves identically to the existing log-only
+ * implementation.
+ */
+enum r5c_journal_mode {
+ R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
+ R5C_JOURNAL_MODE_WRITE_BACK = 1,
+};
+
enum r5_cache_state {
R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked,
* waiting for 25% to be free
@@ -571,6 +544,14 @@ enum r5_cache_state {
*/
};
+#define PENDING_IO_MAX 512
+#define PENDING_IO_ONE_FLUSH 128
+struct r5pending_data {
+ struct list_head sibling;
+ sector_t sector; /* stripe sector */
+ struct bio_list bios;
+};
+
struct r5conf {
struct hlist_head *stripe_hashtbl;
/* only protect corresponding hash list and inactive_list */
@@ -608,10 +589,12 @@ struct r5conf {
*/
struct list_head handle_list; /* stripes needing handling */
+ struct list_head loprio_list; /* low priority stripes */
struct list_head hold_list; /* preread ready stripes */
struct list_head delayed_list; /* stripes that have plugged requests */
struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */
struct bio *retry_read_aligned; /* currently retrying aligned bios */
+ unsigned int retry_read_offset; /* stripes of retry_read_aligned already handled */
struct bio *retry_read_aligned_list; /* aligned bios retry list */
atomic_t preread_active_stripes; /* stripes with scheduled io */
atomic_t active_aligned_reads;
@@ -621,9 +604,6 @@ struct r5conf {
int skip_copy; /* Don't copy data from bio to stripe cache */
struct list_head *last_hold; /* detect hold_list promotions */
- /* bios to have bi_end_io called after metadata is synced */
- struct bio_list return_bi;
-
atomic_t reshape_stripes; /* stripes with pending writes for reshape */
/* unfortunately we need two cache names as we temporarily have
* two caches.
@@ -663,6 +643,8 @@ struct r5conf {
struct list_head r5c_full_stripe_list;
atomic_t r5c_cached_partial_stripes;
struct list_head r5c_partial_stripe_list;
+ atomic_t r5c_flushing_full_stripes;
+ atomic_t r5c_flushing_partial_stripes;
atomic_t empty_inactive_list_nr;
struct llist_head released_stripes;
@@ -674,6 +656,7 @@ struct r5conf {
int pool_size; /* number of disks in stripeheads in pool */
spinlock_t device_lock;
struct disk_info *disks;
+ struct bio_set *bio_split;
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
@@ -684,6 +667,15 @@ struct r5conf {
int group_cnt;
int worker_cnt_per_group;
struct r5l_log *log;
+ void *log_private;
+
+ spinlock_t pending_bios_lock;
+ bool batch_bio_dispatch;
+ struct r5pending_data *pending_data;
+ struct list_head free_list;
+ struct list_head pending_list;
+ int pending_data_cnt;
+ struct r5pending_data *next_pending_data;
};
@@ -759,33 +751,5 @@ extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce);
extern int raid5_calc_degraded(struct r5conf *conf);
-extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
-extern void r5l_exit_log(struct r5l_log *log);
-extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
-extern void r5l_write_stripe_run(struct r5l_log *log);
-extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
-extern void r5l_stripe_write_finished(struct stripe_head *sh);
-extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int state);
-extern bool r5l_log_disk_error(struct r5conf *conf);
-extern bool r5c_is_writeback(struct r5l_log *log);
-extern int
-r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s, int disks);
-extern void
-r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s);
-extern void r5c_release_extra_page(struct stripe_head *sh);
-extern void r5c_use_extra_page(struct stripe_head *sh);
-extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
-extern void r5c_handle_cached_data_endio(struct r5conf *conf,
- struct stripe_head *sh, int disks, struct bio_list *return_bi);
-extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
- struct stripe_head_state *s);
-extern void r5c_make_stripe_write_out(struct stripe_head *sh);
-extern void r5c_flush_cache(struct r5conf *conf, int num);
-extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
-extern void r5c_check_cached_full_stripe(struct r5conf *conf);
-extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode);
#endif